# Source code for dacy.datasets.names

"""Helper functions for loading name dictionaries for person augmentation."""

import os
from typing import Optional

import pandas as pd


def load_names(
    min_count: int = 0,
    ethnicity: Optional[str] = None,  # type: ignore
    gender: Optional[str] = None,  # type: ignore
    min_prop_gender: float = 0,
) -> dict[str, list[str]]:  # type: ignore
    """Load the names lookup table and split it into first and last names.

    Danish names are from Danmarks statistik (2021). Muslim names are from
    Meldgaard (2005),
    https://nors.ku.dk/publikationer/webpublikationer/muslimske_fornavne/.

    The table is read from ``lookup_tables/names.csv`` next to this module and
    is expected to contain the columns "name", "count", "gender", "first_name"
    and "ethnicity".

    Args:
        min_count (int, optional): Minimum number of occurrences of the name
            for it to be included. Defaults to 0.
        ethnicity (Optional[str], optional): Which ethnicity should be
            included. None indicates that all are included. Options include
            "muslim", "danish". Defaults to None.
        gender (Optional[str], optional): Which gender should be included.
            None indicates that all are included. Options include "male",
            "female". Only first names are filtered by gender. Defaults to
            None.
        min_prop_gender (float): Minimum probability of a name being a given
            gender. The probability of a given name being a specific gender is
            based on the proportion of people with the given name of that
            gender. Only used when gender is set. Defaults to 0.

    Returns:
        dict[str, list[str]]: A dictionary of names containing the keys
            "first_name" and "last_name".
    """
    path = os.path.join(  # noqa
        os.path.dirname(os.path.abspath(__file__)),  # noqa
        "lookup_tables",
        "names.csv",
    )
    names = pd.read_csv(path)

    if min_count:
        names = names.loc[names["count"] >= min_count]

    if ethnicity is not None:
        names = names.loc[names["ethnicity"] == ethnicity]

    # Last names are extracted before the gender filter below, so they are
    # restricted by `min_count` and `ethnicity` only.
    last_names = names.loc[names["first_name"] == False]  # noqa
    if gender is not None:
        # Total count for each (name, gender, first_name) combination.
        names = names.groupby(["name", "gender", "first_name"])["count"].sum()
        names = names.reset_index()
        # Turn the counts into the share of each name's bearers that fall in
        # the given row. `transform` keeps the frame's own index, so the result
        # does not depend on how `groupby.apply` shapes its output index.
        name_totals = names.groupby("name")["count"].transform("sum")
        names["count"] = names["count"] / name_totals
        names = names.loc[
            (names["gender"] == gender) & (names["count"] >= min_prop_gender)
        ]

    first_names = names.loc[names["first_name"] == True]  # noqa
    return {
        "first_name": first_names["name"].tolist(),
        "last_name": last_names["name"].tolist(),
    }


def muslim_names() -> dict[str, list[str]]:  # type: ignore
    """Return a dictionary of Muslim names.

    Thin wrapper around ``load_names(ethnicity="muslim")``.

    Returns:
        dict[str, list[str]]: A dictionary of Muslim names containing the keys
            "first_name" and "last_name". The list is derived from Meldgaard (2005),
            https://nors.ku.dk/publikationer/webpublikationer/muslimske_fornavne/.

    Example:
        >>> from dacy.datasets import muslim_names
        >>> names = muslim_names()
        >>> names["first_name"]
        >>> names["last_name"]
    """
    return load_names(ethnicity="muslim")


def danish_names() -> dict[str, list[str]]:  # type: ignore
    """Return a dictionary of Danish names.

    Thin wrapper around ``load_names(ethnicity="danish")``.

    Returns:
        dict[str, list[str]]: A dictionary of Danish names containing the keys
            "first_name" and "last_name". The list is derived from Danmarks
            statistik (2021).

    Example:
        >>> from dacy.datasets import danish_names
        >>> names = danish_names()
        >>> names["first_name"]
        >>> names["last_name"]
    """
    return load_names(ethnicity="danish")


def female_names() -> dict[str, list[str]]:  # type: ignore
    """Return a dictionary of Danish female names.

    Calls ``load_names`` with ``ethnicity="danish"``, ``gender="female"`` and
    ``min_prop_gender=0.5``, i.e. a first name is kept when at least half of
    its counted bearers are female. Only first names are filtered by gender.

    Returns:
        dict[str, list[str]]: A dictionary of names containing the keys "first_name"
            and "last_name". The list is derived from Danmarks statistik (2021).

    Example:
        >>> from dacy.datasets import female_names
        >>> names = female_names()
        >>> names["first_name"]
        >>> names["last_name"]
    """
    return load_names(ethnicity="danish", gender="female", min_prop_gender=0.5)


def male_names() -> dict[str, list[str]]:  # type: ignore
    """Return a dictionary of Danish male names.

    Calls ``load_names`` with ``ethnicity="danish"``, ``gender="male"`` and
    ``min_prop_gender=0.5``, i.e. a first name is kept when at least half of
    its counted bearers are male. Only first names are filtered by gender.

    Returns:
        dict[str, list[str]]: A dictionary of names containing the keys "first_name"
            and "last_name". The list is derived from Danmarks statistik (2021).

    Example:
        >>> from dacy.datasets import male_names
        >>> names = male_names()
        >>> names["first_name"]
        >>> names["last_name"]
    """
    return load_names(ethnicity="danish", gender="male", min_prop_gender=0.5)