Source code for dacy.download

"""Functions for downloading DaCy models."""
import os
from importlib.metadata import version
from pathlib import Path

from spacy.util import get_installed_models
from tqdm import tqdm  # type: ignore

DACY_DEFAULT_PATH = Path.home() / ".dacy"

DEFAULT_CACHE_DIR = os.getenv(
    "DACY_CACHE_DIR",
    DACY_DEFAULT_PATH,
)

models_url = {
    "da_dacy_small_trf-0.2.0": "https://huggingface.co/chcaa/da_dacy_small_trf/resolve/0eadea074d5f637e76357c46bbd56451471d0154/da_dacy_small_trf-any-py3-none-any.whl",
    "da_dacy_medium_trf-0.2.0": "https://huggingface.co/chcaa/da_dacy_medium_trf/resolve/e7dba91f855a1d26679dc1ef3aa49f7874b50543/da_dacy_medium_trf-any-py3-none-any.whl",
    "da_dacy_large_trf-0.2.0": "https://huggingface.co/chcaa/da_dacy_large_trf/resolve/963232f378190476503a1bfc35b520cb142e9e41/da_dacy_large_trf-any-py3-none-any.whl",
    "small": None,
    "medium": None,
    "large": None,
    "da_dacy_small_ner_fine_grained-0.1.0": "https://huggingface.co/chcaa/da_dacy_small_ner_fine_grained/resolve/43fedc5a1b1c1d193f461d13225f217f2ced507d/da_dacy_small_ner_fine_grained-any-py3-none-any.whl",
    "da_dacy_medium_ner_fine_grained-0.1.0": "https://huggingface.co/chcaa/da_dacy_medium_ner_fine_grained/resolve/4bfc4397b720acdb6428d64f18e90bfd439c80fc/da_dacy_medium_ner_fine_grained-any-py3-none-any.whl",
    "da_dacy_large_ner_fine_grained-0.1.0": "https://huggingface.co/chcaa/da_dacy_large_ner_fine_grained/resolve/08f973a1ff57120268bf30d3b7e7c4656ed25a58/da_dacy_large_ner_fine_grained-any-py3-none-any.whl",
}


[docs]def get_latest_version(model: str) -> str: """Returns the latest version of a DaCy model. Args: model: string indicating the model Returns: str: latest version of the model """ if model in {"small", "medium", "large"}: model = f"da_dacy_{model}_trf" versions = [mdl.split("-")[-1] for mdl in models_url if mdl.startswith(model)] versions = sorted( versions, key=lambda s: [int(u) for u in s.split(".")], # type: ignore reverse=True, ) return versions[0] # type: ignore
[docs]def models() -> list[str]: """Returns a list of valid DaCy models. Returns: list: list of valid DaCy models """ return list(models_url.keys())
class DownloadProgressBar(tqdm): def update_to(self, b: int = 1, bsize: int = 1, tsize=None) -> None: # noqa if tsize is not None: self.total = tsize self.update(b * bsize - self.n) def download_url(url: str, output_path: str) -> None: import urllib.request with DownloadProgressBar( unit="B", unit_scale=True, miniters=1, desc=url.split("/")[-1], # type: ignore ) as t: urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)
[docs]def install(package: str, url: str) -> None: import subprocess import sys subprocess.check_call( [sys.executable, "-m", "pip", "install", f"{package} @ {url}", "--no-deps"], )
[docs]def download_model( model: str, force: bool = False, ) -> str: """Downloads and install a specified DaCy pipeline. Args: model: string indicating DaCy model, use dacy.models() to get a list of models. force: Should it download the model regardless of it already being present? Defaults to False. Returns: a string of the model location Example: >>> download_model(model="da_dacy_medium_trf-0.1.0") """ if model in {"small", "medium", "large"}: latest_version = get_latest_version(model) model = f"da_dacy_{model}_trf-{latest_version}" mdl_version = model.split("-")[-1] # type: ignore if model not in models_url: raise ValueError( f"The model '{model}' is not available in DaCy. Please use dacy.models() to see a" + " list of all models", ) mdl = model.split("-")[0] # type: ignore if mdl in get_installed_models() and not force and version(mdl) == mdl_version: return mdl package = model.split("-")[0] install(package, models_url[model]) return mdl