
The Bias Evaluator

To test a model on the coreference tasks, follow this tutorial.

Module for detecting gender bias in Danish language models.
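All three tests follow the same pattern: instantiate the `Evaluator` with the name of the model under test and call one of its `evaluate_*` methods. Each method returns a list of two data frames, a condensed and a detailed performance overview. A minimal sketch (the model name below is a placeholder):

```python
from genda_lens import Evaluator

# instantiate the evaluator for the model under test (placeholder name)
ev = Evaluator(model_name="huggingface-modelname")

# every evaluate_* method returns [condensed_df, detailed_df]
output = ev.evaluate_ner(n=20)
simple_output, detailed_output = output
```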

Source code in genda_lens/genda_lens.py
class Evaluator:
    """Module for detecting gender bias in Danish language models."""

    def __init__(self, model_name):
        self.model_name = model_name
        print(
            f"[INFO] You can test {self.model_name} by running Evaluator.evaluate_<model type>()"
        )

    def evaluate_pretrained(
        self, test, mask_token=None, start_token=None, sep_token=None
    ):
        """Evaluate gender bias in a pre-trained model trained with masked language modeling.

        This function can be used for running two different tests:
        The DaWinobias Language Modeling Task and the ABC Language Modeling Task.
        Read more about the specifics of these tests in the User Guide.

        Args:
            test (str): choose between "abc" or "dawinobias"
            mask_token (str, optional): mask token of tested model. Specify when running test "dawinobias". Defaults to None.
            start_token (str, optional): start token of tested model. Specify when running test "abc". Defaults to None.
            sep_token (str, optional): sep token of tested model. Specify when running test "abc". Defaults to None.

        Returns:
            list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

        *EXAMPLE*

           ```python
           from genda_lens import Evaluator

           # initiate evaluator
           ev = Evaluator(model_name="huggingface-modelname")

           # run abc test
           output = ev.evaluate_pretrained(test="abc", start_token="<s>", sep_token="</s>")

           # retrieve output
           simple_output = output[0]
           detailed_output = output[1]

           ```
        """
        import pandas as pd
        import spacy
        from transformers import pipeline

        from .lm_tasks.abc_utils import get_output, load_abc, load_mdl, run_abc
        from .lm_tasks.wino_utils import evaluate_lm_winobias, run_winobias

        ### RUN ABC
        # load data
        if test == "abc":
            if start_token is None:
                raise ValueError(
                    "Please specify input argument 'start_token'(str) when running the ABC language modeling task."
                )
            if sep_token is None:
                raise ValueError(
                    "Please specify input argument 'sep_token'(str) when running the ABC language modeling task."
                )

            print(f"[INFO] Running the ABC language modeling task on {self.model_name}")
            refl_sents_m, refl_sents_f = load_abc()
            # load tokenizer and model
            model, tokenizer = load_mdl(self.model_name)

            # create results df
            out_df_f = run_abc(
                refl_sents_f, "female", tokenizer, model, start_token, sep_token
            )
            out_df_m = run_abc(
                refl_sents_m, "male", tokenizer, model, start_token, sep_token
            )

            # evaluate abc
            results = get_output(out_df_f, out_df_m, model_name=self.model_name)

        elif test == "dawinobias":
            if mask_token is None:
                raise ValueError(
                    "Please specify input argument 'mask_token'(str) when running the DaWinobias language modeling task."
                )
            print(
                f"[INFO] Running the DaWinobias language modeling task on {self.model_name}"
            )
            # load model used for tokenization
            try:
                tokenizer = spacy.load("da_core_news_sm")
            except OSError:
                print("[INFO] Downloading tokenizer: da_core_news_sm from spaCy.")
                from spacy.cli.download import download

                download("da_core_news_sm")
                tokenizer = spacy.load("da_core_news_sm")

            # initiate pipeline
            print(f"[INFO] Loading model {self.model_name} from Hugging Face.")
            nlp = pipeline(task="fill-mask", model=self.model_name)

            # run wino
            clf_rep_anti, clf_rep_pro = run_winobias(
                tokenizer, nlp, mask_token=mask_token, model_name=self.model_name
            )

            results = evaluate_lm_winobias(
                clf_rep_anti, clf_rep_pro, model_name=self.model_name
            )

        else:
            raise ValueError("Not a valid test. Choose between 'abc' and 'dawinobias'")
        print("[INFO] Output generated.")
        return results

    def evaluate_ner(self, n):
        """Evaluate gender bias in a NER model.
        This function can be used for running the DaNE dataset test.
        Read more about the specifics of this test in the User Guide.

        Args:
            n (int): Number of repetitions to run the augmentation pipeline. To ensure robustness we recommend a value of n >= 20.

        Returns:
            list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

        *EXAMPLE*

           ```python
            from genda_lens import Evaluator

            # initiate evaluator
            ev = Evaluator(model_name="huggingface-modelname")

            # run test
            output = ev.evaluate_ner(n=20)

            # retrieve output
            simple_output = output[0]
            detailed_output = output[1]

           ```
        """
        from dacy.datasets import dane

        from .ner_tasks.augmentation import f_aug, m_aug, muslim_f_aug, muslim_m_aug
        from .ner_tasks.performance import load_mdl, eval_model_augmentation

        testdata = dane(
            splits=["test"], redownload=True, open_unverified_connected=True
        )

        model = load_mdl(self.model_name)
        if n <= 1:
            print(
                f"[INFO] Please choose a value for n larger than 1 to ensure robustness, got: {n}."
            )
            print(
                f"[INFO] Running the NER task on {self.model_name} with a low value of n."
            )
        else:
            print(f"[INFO] Running the NER task on {self.model_name}")

        # define augmenters
        augmenters = [
            (f_aug, "Majority female names", n),
            (m_aug, "Majority male names", n),
            (muslim_f_aug, "Minority female names", n),
            (muslim_m_aug, "Minority male names", n),
        ]

        # run model
        output = eval_model_augmentation(
            model, self.model_name, str(n), augmenters, testdata
        )
        print("[INFO] Output generated.")
        return output

    def evaluate_coref(self, test, model):
        """Evaluate gender bias in a coreference model.

        This function can be used for running two different tests:
        The DaWinobias Coreference Task and the ABC Coreference Task.
        Read more about the specifics of these tests in the User Guide.

        Args:
            test (str): choose between "abc" or "dawinobias"
            model (object): a coreference model object, e.g. the DaNLP XLM-R coreference model.

        Returns:
            list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

        *EXAMPLE*

           ```python
           from genda_lens import Evaluator

           # load coref model
           from danlp import load_xlmr_coref_model
           model = load_xlmr_coref_model()

           # initiate evaluator
           ev = Evaluator(model_name="danlp-xlmr")

           # run abc test
           output = ev.evaluate_coref(test="abc", model=model)

           # retrieve output
           simple_output = output[0]
           detailed_output = output[1]
           ```
        """
        import os
        import random
        import sys

        # import json
        from pathlib import Path

        import numpy as np
        import pandas as pd
        import spacy
        import torch
        from sklearn.metrics import classification_report, f1_score

        if test == "dawinobias":
            import nltk
            from .coref_tasks.wino_utils import (
                evaluate_coref_winobias,
                run_winobias_coref,
            )

            nltk.download("omw-1.4")  # Open Multilingual Wordnet; only needed for DaWinobias
            # load model used for tokenization
            try:
                nlp = spacy.load("da_core_news_sm")
            except OSError:
                print("[INFO] Downloading tokenizer: da_core_news_sm from spaCy.")
                from spacy.cli.download import download

                download("da_core_news_sm")
                nlp = spacy.load("da_core_news_sm")

            print(
                f"[INFO] Running the DaWinobias coreference task on {self.model_name}"
            )
            anti_res, pro_res = run_winobias_coref(model, nlp)
            results = evaluate_coref_winobias(
                anti_res, pro_res, model_name=self.model_name
            )

        elif test == "abc":
            import pandas as pd
            from .coref_tasks.abc_utils import (
                eval_results,
                evaluate_coref_abc,
                run_abc_coref,
            )

            print(f"[INFO] Running the ABC coreference task on {self.model_name}")
            fem_preds, male_preds = run_abc_coref(model)
            # two dicts of f1 scores
            df_fem, df_male, all_sents = evaluate_coref_abc(
                fem_preds=fem_preds, male_preds=male_preds
            )

            results = eval_results(
                df_fem, df_male, all_sents, model_name=self.model_name
            )
        else:
            raise ValueError("Not a valid test. Choose between 'abc' and 'dawinobias'")
        print("[INFO] Output generated.")
        return results

evaluate_coref(test, model)

Evaluate gender bias in a coreference model.

This function can be used for running two different tests: The DaWinobias Coreference Task and the ABC Coreference Task. Read more about the specifics of these tests in the User Guide.

Parameters:

    test (str): choose between "abc" or "dawinobias". Required.
    model (object): a coreference model object, e.g. the DaNLP XLM-R coreference model. Required.

Returns:

    list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

EXAMPLE

from genda_lens import Evaluator

# load coref model
from danlp import load_xlmr_coref_model
model = load_xlmr_coref_model()

# initiate evaluator
ev = Evaluator(model_name="danlp-xlmr")

# run abc test
output = ev.evaluate_coref(test="abc", model=model)

# retrieve output
simple_output = output[0]
detailed_output = output[1]
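The DaWinobias variant of the coreference task takes the same arguments; only the test name changes. A short sketch, reusing the evaluator and coreference model from the example above:

```python
# run the DaWinobias coreference task on the same model
output = ev.evaluate_coref(test="dawinobias", model=model)

# retrieve output
simple_output = output[0]
detailed_output = output[1]
```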
Source code in genda_lens/genda_lens.py
def evaluate_coref(self, test, model):
    """Evaluate gender bias in a coreference model.

    This function can be used for running two different tests:
    The DaWinobias Coreference Task and the ABC Coreference Task.
    Read more about the specifics of these tests in the User Guide.

    Args:
        test (str): choose between "abc" or "dawinobias"
        model (object): a coreference model object, e.g. the DaNLP XLM-R coreference model.

    Returns:
        list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

    *EXAMPLE*

       ```python
       from genda_lens import Evaluator

       # load coref model
       from danlp import load_xlmr_coref_model
       model = load_xlmr_coref_model()

       # initiate evaluator
       ev = Evaluator(model_name="danlp-xlmr")

       # run abc test
       output = ev.evaluate_coref(test="abc", model=model)

       # retrieve output
       simple_output = output[0]
       detailed_output = output[1]
       ```
    """
    import os
    import random
    import sys

    # import json
    from pathlib import Path

    import numpy as np
    import pandas as pd
    import spacy
    import torch
    from sklearn.metrics import classification_report, f1_score

    if test == "dawinobias":
        import nltk
        from .coref_tasks.wino_utils import (
            evaluate_coref_winobias,
            run_winobias_coref,
        )

        nltk.download("omw-1.4")  # Open Multilingual Wordnet; only needed for DaWinobias
        # load model used for tokenization
        try:
            nlp = spacy.load("da_core_news_sm")
        except OSError:
            print("[INFO] Downloading tokenizer: da_core_news_sm from spaCy.")
            from spacy.cli.download import download

            download("da_core_news_sm")
            nlp = spacy.load("da_core_news_sm")

        print(
            f"[INFO] Running the DaWinobias coreference task on {self.model_name}"
        )
        anti_res, pro_res = run_winobias_coref(model, nlp)
        results = evaluate_coref_winobias(
            anti_res, pro_res, model_name=self.model_name
        )

    elif test == "abc":
        import pandas as pd
        from .coref_tasks.abc_utils import (
            eval_results,
            evaluate_coref_abc,
            run_abc_coref,
        )

        print(f"[INFO] Running the ABC coreference task on {self.model_name}")
        fem_preds, male_preds = run_abc_coref(model)
        # two dicts of f1 scores
        df_fem, df_male, all_sents = evaluate_coref_abc(
            fem_preds=fem_preds, male_preds=male_preds
        )

        results = eval_results(
            df_fem, df_male, all_sents, model_name=self.model_name
        )
    else:
        raise ValueError("Not a valid test. Choose between 'abc' and 'dawinobias'")
    print("[INFO] Output generated.")
    return results

evaluate_ner(n)

Evaluate gender bias in a NER model. This function can be used for running the DaNE dataset test. Read more about the specifics of this test in the User Guide.

Parameters:

    n (int): Number of repetitions to run the augmentation pipeline. To ensure robustness we recommend a value of n >= 20. Required.

Returns:

    list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

EXAMPLE

 from genda_lens import Evaluator

 # initiate evaluator
 ev = Evaluator(model_name="huggingface-modelname")

 # run test
 output = ev.evaluate_ner(n=20)

 # retrieve output
 simple_output = output[0]
 detailed_output = output[1]
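Both elements of the returned list are data frames, so they can be inspected or persisted like any other pandas DataFrame. A short sketch, continuing from the example above (file names are illustrative):

```python
# save the condensed and detailed NER results for later comparison
simple_output.to_csv("ner_results_condensed.csv", index=False)
detailed_output.to_csv("ner_results_detailed.csv", index=False)
```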
Source code in genda_lens/genda_lens.py
def evaluate_ner(self, n):
    """Evaluate gender bias in a NER model.
    This function can be used for running the DaNE dataset test.
    Read more about the specifics of this test in the User Guide.

    Args:
        n (int): Number of repetitions to run the augmentation pipeline. To ensure robustness we recommend a value of n >= 20.

    Returns:
        list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

    *EXAMPLE*

       ```python
        from genda_lens import Evaluator

        # initiate evaluator
        ev = Evaluator(model_name="huggingface-modelname")

        # run test
        output = ev.evaluate_ner(n=20)

        # retrieve output
        simple_output = output[0]
        detailed_output = output[1]

       ```
    """
    from dacy.datasets import dane

    from .ner_tasks.augmentation import f_aug, m_aug, muslim_f_aug, muslim_m_aug
    from .ner_tasks.performance import load_mdl, eval_model_augmentation

    testdata = dane(
        splits=["test"], redownload=True, open_unverified_connected=True
    )

    model = load_mdl(self.model_name)
    if n <= 1:
        print(
            f"[INFO] Please choose a value for n larger than 1 to ensure robustness, got: {n}."
        )
        print(
            f"[INFO] Running the NER task on {self.model_name} with a low value of n."
        )
    else:
        print(f"[INFO] Running the NER task on {self.model_name}")

    # define augmenters
    augmenters = [
        (f_aug, "Majority female names", n),
        (m_aug, "Majority male names", n),
        (muslim_f_aug, "Minority female names", n),
        (muslim_m_aug, "Minority male names", n),
    ]

    # run model
    output = eval_model_augmentation(
        model, self.model_name, str(n), augmenters, testdata
    )
    print("[INFO] Output generated.")
    return output

evaluate_pretrained(test, mask_token=None, start_token=None, sep_token=None)

Evaluate gender bias in a pre-trained model trained with masked language modeling.

This function can be used for running two different tests: The DaWinobias Language Modeling Task and the ABC Language Modeling Task. Read more about the specifics of these tests in the User Guide.

Parameters:

    test (str): choose between "abc" or "dawinobias". Required.
    mask_token (str, optional): mask token of tested model. Specify when running test "dawinobias". Defaults to None.
    start_token (str, optional): start token of tested model. Specify when running test "abc". Defaults to None.
    sep_token (str, optional): sep token of tested model. Specify when running test "abc". Defaults to None.

Returns:

    list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

EXAMPLE

from genda_lens import Evaluator

# initiate evaluator
ev = Evaluator(model_name="huggingface-modelname")

# run abc test
output = ev.evaluate_pretrained(test="abc", start_token="<s>", sep_token="</s>")

# retrieve output
simple_output = output[0]
detailed_output = output[1]
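For the DaWinobias language modeling task only the mask token is required; it must match the mask token used by the tested model's tokenizer (for example "[MASK]" for BERT-style models or "<mask>" for RoBERTa-style models). A short sketch, reusing the evaluator from the example above:

```python
# run the DaWinobias fill-mask task; the mask token must match the model's tokenizer
output = ev.evaluate_pretrained(test="dawinobias", mask_token="<mask>")

# retrieve output
simple_output = output[0]
detailed_output = output[1]
```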
Source code in genda_lens/genda_lens.py
def evaluate_pretrained(
    self, test, mask_token=None, start_token=None, sep_token=None
):
    """Evaluate gender bias in a pre-trained model trained with masked language modeling.

    This function can be used for running two different tests:
    The DaWinobias Language Modeling Task and the ABC Language Modeling Task.
    Read more about the specifics of these tests in the User Guide.

    Args:
        test (str): choose between "abc" or "dawinobias"
        mask_token (str, optional): mask token of tested model. Specify when running test "dawinobias". Defaults to None.
        start_token (str, optional): start token of tested model. Specify when running test "abc". Defaults to None.
        sep_token (str, optional): sep token of tested model. Specify when running test "abc". Defaults to None.

    Returns:
        list (df): Performance output as list. First element: performance in condensed form. Second element: performance in detailed form.

    *EXAMPLE*

       ```python
       from genda_lens import Evaluator

       # initiate evaluator
       ev = Evaluator(model_name="huggingface-modelname")

       # run abc test
       output = ev.evaluate_pretrained(test="abc", start_token="<s>", sep_token="</s>")

       # retrieve output
       simple_output = output[0]
       detailed_output = output[1]

       ```
    """
    import pandas as pd
    import spacy
    from transformers import pipeline

    from .lm_tasks.abc_utils import get_output, load_abc, load_mdl, run_abc
    from .lm_tasks.wino_utils import evaluate_lm_winobias, run_winobias

    ### RUN ABC
    # load data
    if test == "abc":
        if start_token is None:
            raise ValueError(
                "Please specify input argument 'start_token'(str) when running the ABC language modeling task."
            )
        if sep_token is None:
            raise ValueError(
                "Please specify input argument 'sep_token'(str) when running the ABC language modeling task."
            )

        print(f"[INFO] Running the ABC language modeling task on {self.model_name}")
        refl_sents_m, refl_sents_f = load_abc()
        # load tokenizer and model
        model, tokenizer = load_mdl(self.model_name)

        # create results df
        out_df_f = run_abc(
            refl_sents_f, "female", tokenizer, model, start_token, sep_token
        )
        out_df_m = run_abc(
            refl_sents_m, "male", tokenizer, model, start_token, sep_token
        )

        # evaluate abc
        results = get_output(out_df_f, out_df_m, model_name=self.model_name)

    elif test == "dawinobias":
        if mask_token is None:
            raise ValueError(
                "Please specify input argument 'mask_token'(str) when running the DaWinobias language modeling task."
            )
        print(
            f"[INFO] Running the DaWinobias language modeling task on {self.model_name}"
        )
        # load model used for tokenization
        try:
            tokenizer = spacy.load("da_core_news_sm")
        except OSError:
            print("[INFO] Downloading tokenizer: da_core_news_sm from spaCy.")
            from spacy.cli.download import download

            download("da_core_news_sm")
            tokenizer = spacy.load("da_core_news_sm")

        # initiate pipeline
        print(f"[INFO] Loading model {self.model_name} from Hugging Face.")
        nlp = pipeline(task="fill-mask", model=self.model_name)

        # run wino
        clf_rep_anti, clf_rep_pro = run_winobias(
            tokenizer, nlp, mask_token=mask_token, model_name=self.model_name
        )

        results = evaluate_lm_winobias(
            clf_rep_anti, clf_rep_pro, model_name=self.model_name
        )

    else:
        raise ValueError("Not a valid test. Choose between 'abc' and 'dawinobias'")
    print("[INFO] Output generated.")
    return results