"""Helper functions for loading name dictionaries for person augmentation."""
import os
from typing import Dict, List, Optional
import pandas as pd
def load_names(
    min_count: int = 0,
    ethnicity: Optional[str] = None,  # type: ignore
    gender: Optional[str] = None,  # type: ignore
    min_prop_gender: float = 0,
) -> Dict[str, List[str]]:  # type: ignore
    """Load the names lookup table and split it into first and last names.

    Danish names are from Danmarks statistik (2021). Muslim names are from
    Meldgaard (2005),
    https://nors.ku.dk/publikationer/webpublikationer/muslimske_fornavne/.

    The table is read from ``lookup_tables/names.csv`` located next to this
    module. The columns ``name``, ``count``, ``ethnicity``, ``gender`` and
    ``first_name`` are used.

    Args:
        min_count (int, optional): Minimum number of occurrences of the name
            for it to be included. Defaults to 0.
        ethnicity (Optional[str], optional): Which ethnicity should be
            included. None indicates that all are included. Options include
            "muslim", "danish". Defaults to None.
        gender (Optional[str], optional): Which gender should be included.
            None indicates that all are included. Options include "male",
            "female". Only first names are filtered by gender; last names are
            returned regardless of gender. Defaults to None.
        min_prop_gender (float): Minimum probability of a name being a given
            gender. The probability of a given name being a specific gender is
            based on the proportion of people with the given name of that
            gender. Only used when gender is set. Defaults to 0.

    Returns:
        Dict[str, List[str]]: A dictionary of names containing the keys
            "first_name" and "last_name".
    """
    path = os.path.join(  # noqa
        os.path.dirname(os.path.abspath(__file__)),  # noqa
        "lookup_tables",
        "names.csv",
    )
    names = pd.read_csv(path)

    if min_count:
        names = names.loc[names["count"] >= min_count]
    if ethnicity is not None:
        names = names.loc[names["ethnicity"] == ethnicity]

    # Last names are extracted before the gender filter so that they are only
    # restricted by `min_count` and `ethnicity`.
    last_names = names.loc[names["first_name"] == False]  # noqa

    if gender is not None:
        # Total count per (name, gender, first_name) combination.
        counts = names.groupby(["name", "gender", "first_name"]).agg(
            {"count": "sum"},
        )
        # Turn the counts into the share each combination has of its name's
        # total. `transform` keeps the index aligned with `counts`, so the
        # result does not depend on how `groupby.apply` handles group keys,
        # which differs between pandas versions.
        per_name_total = counts.groupby(level="name")["count"].transform("sum")
        counts["count"] = counts["count"] / per_name_total
        names = counts.reset_index()
        # From here on "count" holds a proportion, not an absolute count.
        names = names.loc[
            (names["gender"] == gender) & (names["count"] >= min_prop_gender)
        ]

    first_names = names.loc[names["first_name"] == True]  # noqa
    return {
        "first_name": first_names["name"].tolist(),
        "last_name": last_names["name"].tolist(),
    }
def muslim_names() -> Dict[str, List[str]]:  # type: ignore
    """Fetch the Muslim subset of the names lookup table.

    The list is derived from Meldgaard (2005),
    https://nors.ku.dk/publikationer/webpublikationer/muslimske_fornavne/.

    Returns:
        Dict[str, List[str]]: A mapping with the keys "first_name" and
            "last_name", each holding a list of names.

    Example:
        >>> from dacy.datasets import muslim_names
        >>> names = muslim_names()
        >>> sorted(names)
        ['first_name', 'last_name']
    """
    # Delegates entirely to `load_names`, restricting only on ethnicity.
    muslim = load_names(ethnicity="muslim")
    return muslim
def danish_names() -> Dict[str, List[str]]:  # type: ignore
    """Fetch the Danish subset of the names lookup table.

    The list is derived from Danmarks statistik (2021).

    Returns:
        Dict[str, List[str]]: A mapping with the keys "first_name" and
            "last_name", each holding a list of names.

    Example:
        >>> from dacy.datasets import danish_names
        >>> names = danish_names()
        >>> sorted(names)
        ['first_name', 'last_name']
    """
    # Delegates entirely to `load_names`, restricting only on ethnicity.
    danish = load_names(ethnicity="danish")
    return danish
def female_names() -> Dict[str, List[str]]:  # type: ignore
    """Fetch Danish names whose bearers are predominantly female.

    A first name is included when at least half (proportion >= 0.5) of the
    people carrying it are registered as female. Last names are the Danish
    last names and are not filtered by gender. The list is derived from
    Danmarks statistik (2021).

    Returns:
        Dict[str, List[str]]: A mapping with the keys "first_name" and
            "last_name", each holding a list of names.

    Example:
        >>> from dacy.datasets import female_names
        >>> names = female_names()
        >>> sorted(names)
        ['first_name', 'last_name']
    """
    majority_share = 0.5
    female = load_names(
        ethnicity="danish",
        gender="female",
        min_prop_gender=majority_share,
    )
    return female
def male_names() -> Dict[str, List[str]]:  # type: ignore
    """Fetch Danish names whose bearers are predominantly male.

    A first name is included when at least half (proportion >= 0.5) of the
    people carrying it are registered as male. Last names are the Danish
    last names and are not filtered by gender. The list is derived from
    Danmarks statistik (2021).

    Returns:
        Dict[str, List[str]]: A mapping with the keys "first_name" and
            "last_name", each holding a list of names.

    Example:
        >>> from dacy.datasets import male_names
        >>> names = male_names()
        >>> sorted(names)
        ['first_name', 'last_name']
    """
    majority_share = 0.5
    male = load_names(
        ethnicity="danish",
        gender="male",
        min_prop_gender=majority_share,
    )
    return male