
The Bias Evaluator

To test a model on the coreference tasks, follow this tutorial.

Module for detecting gender bias in Danish language models.
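All three tests follow the same pattern: instantiate the `Evaluator` with the name of the model under test and call one of its `evaluate_*` methods. Each method returns a list of two data frames, a condensed and a detailed performance overview. A minimal sketch (the model name below is a placeholder):

```python
from genda_lens import Evaluator

# instantiate the evaluator for the model under test (placeholder name)
ev = Evaluator(model_name="huggingface-modelname")

# every evaluate_* method returns [condensed_df, detailed_df]
output = ev.evaluate_ner(n=20)
simple_output, detailed_output = output
```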

Source code in genda_lens/genda_lens.py
class Evaluator:
    """Module for detecting gender bias in Danish language models."""

    def __init__(self, model_name):
        self.model_name = model_name
        print(
            f"[INFO] You can test {self.model_name} by running Evaluator.evaluate_<model type>()"
        )

    def evaluate_pretrained(
        self, test, mask_token=None, start_token=None, sep_token=None
    ):
        """Evaluate gender bias in a pre-trained model trained with masked language modeling.

        This function can be used for running two different tests:
        The DaWinobias Language Modeling Task and the ABC Language Modeling Task.
        Read more about the specifics of these tests in the User Guide.

        Args:
            test (str): choose between "abc" or "dawinobias"
            mask_token (str, optional): mask token of tested model. Specify when running test "dawinobias". Defaults to None.
            start_token (str, optional): start token of tested model. Specify when running test "abc". Defaults to None.
            sep_token (str, optional): sep token of tested model. Specify when running test "abc". Defaults to None.

        Returns:
            list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

        *EXAMPLE*

           ```python
           from genda_lens import Evaluator

           # initiate evaluator
           ev = Evaluator(model_name="huggingface-modelname")

           # run abc test
           output = ev.evaluate_pretrained(test="abc", start_token="<s>", sep_token="</s>")

           # retrieve output
           simple_output = output[0]
           detailed_output = output[1]

           ```
        """
        import pandas as pd
        import spacy
        from transformers import pipeline

        from .lm_tasks.abc_utils import get_output, load_abc, load_mdl, run_abc
        from .lm_tasks.wino_utils import evaluate_lm_winobias, run_winobias

        ### RUN ABC
        # load data
        if test == "abc":
            if start_token is None:
                raise ValueError(
                    "Please specify input argument 'start_token'(str) when running the ABC language modeling task."
                )
            if sep_token is None:
                raise ValueError(
                    "Please specify input argument 'sep_token'(str) when running the ABC language modeling task."
                )

            print(f"[INFO] Running the ABC language modeling task on {self.model_name}")
            refl_sents_m, refl_sents_f = load_abc()
            # load tokenizer and model
            model, tokenizer = load_mdl(self.model_name)

            # create results df
            out_df_f = run_abc(
                refl_sents_f, "female", tokenizer, model, start_token, sep_token
            )
            out_df_m = run_abc(
                refl_sents_m, "male", tokenizer, model, start_token, sep_token
            )

            # evaluate abc
            results = get_output(out_df_f, out_df_m, model_name=self.model_name)

        elif test == "dawinobias":
            if mask_token is None:
                raise ValueError(
                    "Please specify input argument 'mask_token'(str) when running the DaWinobias language modeling task."
                )
            print(
                f"[INFO] Running the DaWinobias language modeling task on {self.model_name}"
            )
            # load model used for tokenization
            try:
                tokenizer = spacy.load("da_core_news_sm")
            except OSError:
                print("[INFO] Downloading tokenizer: da_core_news_sm from spaCy.")
                from spacy.cli.download import download

                download("da_core_news_sm")
                tokenizer = spacy.load("da_core_news_sm")

            # initiate pipeline
            print(f"[INFO] Loading model {self.model_name} from Hugging Face.")
            nlp = pipeline(task="fill-mask", model=self.model_name)

            # run wino
            clf_rep_anti, clf_rep_pro = run_winobias(
                tokenizer, nlp, mask_token=mask_token, model_name=self.model_name
            )

            results = evaluate_lm_winobias(
                clf_rep_anti, clf_rep_pro, model_name=self.model_name
            )

        else:
            raise ValueError("Not a valid test. Choose between 'abc' and 'dawinobias'")
        print("[INFO] Output generated.")
        return results

    def evaluate_ner(self, n):
        """Evaluate gender bias in a NER model.
        This function can be used for running the DaNE dataset test.
        Read more about the specifics of this test in the User Guide.

        Args:
            n (int): Number of repetitions to run the augmentation pipeline. To ensure robustness we recommend a value of n >= 20.

        Returns:
            list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

        *EXAMPLE*

           ```python
            from genda_lens import Evaluator

            # initiate evaluator
            ev = Evaluator(model_name="huggingface-modelname")

            # run test
            output = ev.evaluate_ner(n=20)

            # retrieve output
            simple_output = output[0]
            detailed_output = output[1]

           ```
        """
        from dacy.datasets import dane

        from .ner_tasks.augmentation import f_aug, m_aug, muslim_f_aug, muslim_m_aug
        from .ner_tasks.performance import load_mdl, eval_model_augmentation

        testdata = dane(
            splits=["test"], redownload=True, open_unverified_connected=True
        )

        model = load_mdl(self.model_name)
        if n <= 1:
            print(
                f"[INFO] Please choose a value for n larger than 1 to ensure robustness, got: {n}."
            )
            print(
                f"[INFO] Running the NER task on {self.model_name} with a low value of n."
            )
        else:
            print(f"[INFO] Running the NER task on {self.model_name}")

        # define augmenters
        augmenters = [
            (f_aug, "Majority female names", n),
            (m_aug, "Majority male names", n),
            (muslim_f_aug, "Minority female names", n),
            (muslim_m_aug, "Minority male names", n),
        ]

        # run model
        output = eval_model_augmentation(
            model, self.model_name, str(n), augmenters, testdata
        )
        print("[INFO] Output generated.")
        return output

    def evaluate_coref(self, test, model):
        """Evaluate gender bias in a coreference model.

        This function can be used for running two different tests:
        The DaWinobias Coreference Task and the ABC Coreference Task.
        Read more about the specifics of these tests in the User Guide.

        Args:
            test (str): choose between "abc" or "dawinobias"
            model (object): a coreference model object, e.g. the DaNLP XLM-R coreference model.

        Returns:
            list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

        *EXAMPLE*

           ```python
           from genda_lens import Evaluator

           # load coref model
           from danlp import load_xlmr_coref_model
           model = load_xlmr_coref_model()

           # initiate evaluator
           ev = Evaluator(model_name="danlp-xlmr")

           # run abc test
           output = ev.evaluate_coref(test="abc", model=model)

           # retrieve output
           simple_output = output[0]
           detailed_output = output[1]
           ```
        """
        import os
        import random
        import sys

        # import json
        from pathlib import Path

        import numpy as np
        import pandas as pd
        import spacy
        import torch
        from sklearn.metrics import classification_report, f1_score

        if test == "dawinobias":
            import nltk
            from .coref_tasks.wino_utils import (
                evaluate_coref_winobias,
                run_winobias_coref,
            )

            nltk.download("omw-1.4")  # Open Multilingual Wordnet; only needed for DaWinobias
            # load model used for tokenization
            try:
                nlp = spacy.load("da_core_news_sm")
            except OSError:
                print("[INFO] Downloading tokenizer: da_core_news_sm from spaCy.")
                from spacy.cli.download import download

                download("da_core_news_sm")
                nlp = spacy.load("da_core_news_sm")

            print(
                f"[INFO] Running the DaWinobias coreference task on {self.model_name}"
            )
            anti_res, pro_res = run_winobias_coref(model, nlp)
            results = evaluate_coref_winobias(
                anti_res, pro_res, model_name=self.model_name
            )

        elif test == "abc":
            import pandas as pd
            from .coref_tasks.abc_utils import (
                eval_results,
                evaluate_coref_abc,
                run_abc_coref,
            )

            print(f"[INFO] Running the ABC coreference task on {self.model_name}")
            fem_preds, male_preds = run_abc_coref(model)
            # two dicts of f1 scores
            df_fem, df_male, all_sents = evaluate_coref_abc(
                fem_preds=fem_preds, male_preds=male_preds
            )

            results = eval_results(
                df_fem, df_male, all_sents, model_name=self.model_name
            )
        else:
            raise ValueError("Not a valid test. Choose between 'abc' and 'dawinobias'")
        print("[INFO] Output generated.")
        return results

evaluate_coref(test, model)

Evaluate gender bias in a coreference model.

This function can be used for running two different tests: The DaWinobias Coreference Task and the ABC Coreference Task. Read more about the specifics of these tests in the User Guide.

Parameters:

    test (str): choose between "abc" or "dawinobias". Required.
    model (object): a coreference model object, e.g. the DaNLP XLM-R coreference model. Required.

Returns:

    list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

EXAMPLE

from genda_lens import Evaluator

# load coref model
from danlp import load_xlmr_coref_model
model = load_xlmr_coref_model()

# initiate evaluator
ev = Evaluator(model_name="danlp-xlmr")

# run abc test
output = ev.evaluate_coref(test="abc", model=model)

# retrieve output
simple_output = output[0]
detailed_output = output[1]
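The DaWinobias variant of the coreference task takes the same arguments; only the test name changes. A short sketch, reusing the evaluator and coreference model from the example above:

```python
# run the DaWinobias coreference task on the same model
output = ev.evaluate_coref(test="dawinobias", model=model)

# retrieve output
simple_output = output[0]
detailed_output = output[1]
```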
Source code in genda_lens/genda_lens.py
def evaluate_coref(self, test, model):
    """Evaluate gender bias in a coreference model.

    This function can be used for running two different tests:
    The DaWinobias Coreference Task and the ABC Coreference Task.
    Read more about the specifics of these tests in the User Guide.

    Args:
        test (str): choose between "abc" or "dawinobias"
        model (object): a coreference model object, e.g. the DaNLP XLM-R coreference model.

    Returns:
        list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

    *EXAMPLE*

       ```python
       from genda_lens import Evaluator

       # load coref model
       from danlp import load_xlmr_coref_model
       model = load_xlmr_coref_model()

       # initiate evaluator
       ev = Evaluator(model_name="danlp-xlmr")

       # run abc test
       output = ev.evaluate_coref(test="abc", model=model)

       # retrieve output
       simple_output = output[0]
       detailed_output = output[1]
       ```
    """
    import os
    import random
    import sys

    # import json
    from pathlib import Path

    import numpy as np
    import pandas as pd
    import spacy
    import torch
    from sklearn.metrics import classification_report, f1_score

    if test == "dawinobias":
        import nltk
        from .coref_tasks.wino_utils import (
            evaluate_coref_winobias,
            run_winobias_coref,
        )

        nltk.download("omw-1.4")  # Open Multilingual Wordnet; only needed for DaWinobias
        # load model used for tokenization
        try:
            nlp = spacy.load("da_core_news_sm")
        except OSError:
            print("[INFO] Downloading tokenizer: da_core_news_sm from spaCy.")
            from spacy.cli.download import download

            download("da_core_news_sm")
            nlp = spacy.load("da_core_news_sm")

        print(
            f"[INFO] Running the DaWinobias coreference task on {self.model_name}"
        )
        anti_res, pro_res = run_winobias_coref(model, nlp)
        results = evaluate_coref_winobias(
            anti_res, pro_res, model_name=self.model_name
        )

    elif test == "abc":
        import pandas as pd
        from .coref_tasks.abc_utils import (
            eval_results,
            evaluate_coref_abc,
            run_abc_coref,
        )

        print(f"[INFO] Running the ABC coreference task on {self.model_name}")
        fem_preds, male_preds = run_abc_coref(model)
        # two dicts of f1 scores
        df_fem, df_male, all_sents = evaluate_coref_abc(
            fem_preds=fem_preds, male_preds=male_preds
        )

        results = eval_results(
            df_fem, df_male, all_sents, model_name=self.model_name
        )
    else:
        raise ValueError("Not a valid test. Choose between 'abc' and 'dawinobias'")
    print("[INFO] Output generated.")
    return results

evaluate_ner(n)

Evaluate gender bias in a NER model. This function can be used for running the DaNE dataset test. Read more about the specifics of this test in the User Guide.

Parameters:

    n (int): Number of repetitions to run the augmentation pipeline. To ensure robustness we recommend a value of n >= 20. Required.

Returns:

    list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

EXAMPLE

 from genda_lens import Evaluator

 # initiate evaluator
 ev = Evaluator(model_name="huggingface-modelname")

 # run test
 output = ev.evaluate_ner(n=20)

 # retrieve output
 simple_output = output[0]
 detailed_output = output[1]
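Both elements of the returned list are data frames, so they can be inspected or persisted like any other pandas DataFrame. A short sketch, continuing from the example above (file names are illustrative):

```python
# save the condensed and detailed NER results for later comparison
simple_output.to_csv("ner_results_condensed.csv", index=False)
detailed_output.to_csv("ner_results_detailed.csv", index=False)
```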
Source code in genda_lens/genda_lens.py
def evaluate_ner(self, n):
    """Evaluate gender bias in a NER model.
    This function can be used for running the DaNE dataset test.
    Read more about the specifics of this test in the User Guide.

    Args:
        n (int): Number of repetitions to run the augmentation pipeline. To ensure robustness we recommend a value of n >= 20.

    Returns:
        list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

    *EXAMPLE*

       ```python
        from genda_lens import Evaluator

        # initiate evaluator
        ev = Evaluator(model_name="huggingface-modelname")

        # run test
        output = ev.evaluate_ner(n=20)

        # retrieve output
        simple_output = output[0]
        detailed_output = output[1]

       ```
    """
    from dacy.datasets import dane

    from .ner_tasks.augmentation import f_aug, m_aug, muslim_f_aug, muslim_m_aug
    from .ner_tasks.performance import load_mdl, eval_model_augmentation

    testdata = dane(
        splits=["test"], redownload=True, open_unverified_connected=True
    )

    model = load_mdl(self.model_name)
    if n <= 1:
        print(
            f"[INFO] Please choose a value for n larger than 1 to ensure robustness, got: {n}."
        )
        print(
            f"[INFO] Running the NER task on {self.model_name} with a low value of n."
        )
    else:
        print(f"[INFO] Running the NER task on {self.model_name}")

    # define augmenters
    augmenters = [
        (f_aug, "Majority female names", n),
        (m_aug, "Majority male names", n),
        (muslim_f_aug, "Minority female names", n),
        (muslim_m_aug, "Minority male names", n),
    ]

    # run model
    output = eval_model_augmentation(
        model, self.model_name, str(n), augmenters, testdata
    )
    print("[INFO] Output generated.")
    return output

evaluate_pretrained(test, mask_token=None, start_token=None, sep_token=None)

Evaluate gender bias in a pre-trained model trained with masked language modeling.

This function can be used for running two different tests: The DaWinobias Language Modeling Task and the ABC Language Modeling Task. Read more about the specifics of these tests in the User Guide.

Parameters:

    test (str): choose between "abc" or "dawinobias". Required.
    mask_token (str, optional): mask token of tested model. Specify when running test "dawinobias". Defaults to None.
    start_token (str, optional): start token of tested model. Specify when running test "abc". Defaults to None.
    sep_token (str, optional): sep token of tested model. Specify when running test "abc". Defaults to None.

Returns:

    list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

EXAMPLE

from genda_lens import Evaluator

# initiate evaluator
ev = Evaluator(model_name="huggingface-modelname")

# run abc test
output = ev.evaluate_pretrained(test="abc", start_token="<s>", sep_token="</s>")

# retrieve output
simple_output = output[0]
detailed_output = output[1]
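For the DaWinobias language modeling task only the mask token is required; it must match the mask token used by the tested model's tokenizer (for example "[MASK]" for BERT-style models or "<mask>" for RoBERTa-style models). A short sketch, reusing the evaluator from the example above:

```python
# run the DaWinobias fill-mask task; the mask token must match the model's tokenizer
output = ev.evaluate_pretrained(test="dawinobias", mask_token="<mask>")

# retrieve output
simple_output = output[0]
detailed_output = output[1]
```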
Source code in genda_lens/genda_lens.py
def evaluate_pretrained(
    self, test, mask_token=None, start_token=None, sep_token=None
):
    """Evaluate gender bias in a pre-trained model trained with masked language modeling.

    This function can be used for running two different tests:
    The DaWinobias Language Modeling Task and the ABC Language Modeling Task.
    Read more about the specifics of these tests in the User Guide.

    Args:
        test (str): choose between "abc" or "dawinobias"
        mask_token (str, optional): mask token of tested model. Specify when running test "dawinobias". Defaults to None.
        start_token (str, optional): start token of tested model. Specify when running test "abc". Defaults to None.
        sep_token (str, optional): sep token of tested model. Specify when running test "abc". Defaults to None.

    Returns:
        list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

    *EXAMPLE*

       ```python
       from genda_lens import Evaluator

       # initiate evaluator
       ev = Evaluator(model_name="huggingface-modelname")

       # run abc test
       output = ev.evaluate_pretrained(test="abc", start_token="<s>", sep_token="</s>")

       # retrieve output
       simple_output = output[0]
       detailed_output = output[1]

       ```
    """
    import pandas as pd
    import spacy
    from transformers import pipeline

    from .lm_tasks.abc_utils import get_output, load_abc, load_mdl, run_abc
    from .lm_tasks.wino_utils import evaluate_lm_winobias, run_winobias

    ### RUN ABC
    # load data
    if test == "abc":
        if start_token is None:
            raise ValueError(
                "Please specify input argument 'start_token'(str) when running the ABC language modeling task."
            )
        if sep_token is None:
            raise ValueError(
                "Please specify input argument 'sep_token'(str) when running the ABC language modeling task."
            )

        print(f"[INFO] Running the ABC language modeling task on {self.model_name}")
        refl_sents_m, refl_sents_f = load_abc()
        # load tokenizer and model
        model, tokenizer = load_mdl(self.model_name)

        # create results df
        out_df_f = run_abc(
            refl_sents_f, "female", tokenizer, model, start_token, sep_token
        )
        out_df_m = run_abc(
            refl_sents_m, "male", tokenizer, model, start_token, sep_token
        )

        # evaluate abc
        results = get_output(out_df_f, out_df_m, model_name=self.model_name)

    elif test == "dawinobias":
        if mask_token is None:
            raise ValueError(
                "Please specify input argument 'mask_token'(str) when running the DaWinobias language modeling task."
            )
        print(
            f"[INFO] Running the DaWinobias language modeling task on {self.model_name}"
        )
        # load model used for tokenization
        try:
            tokenizer = spacy.load("da_core_news_sm")
        except OSError:
            print("[INFO] Downloading tokenizer: da_core_news_sm from spaCy.")
            from spacy.cli.download import download

            download("da_core_news_sm")
            tokenizer = spacy.load("da_core_news_sm")

        # initiate pipeline
        print(f"[INFO] Loading model {self.model_name} from Hugging Face.")
        nlp = pipeline(task="fill-mask", model=self.model_name)

        # run wino
        clf_rep_anti, clf_rep_pro = run_winobias(
            tokenizer, nlp, mask_token=mask_token, model_name=self.model_name
        )

        results = evaluate_lm_winobias(
            clf_rep_anti, clf_rep_pro, model_name=self.model_name
        )

    else:
        raise ValueError("Not a valid test. Choose between 'abc' and 'dawinobias'")
    print("[INFO] Output generated.")
    return results