Source code for dacy.datasets.dane

"""This includes the DaNE dataset wrapped and read in as a SpaCy corpus."""

import shutil
import subprocess
import sys
from os import PathLike
from pathlib import Path
from typing import List, Optional, Union

from spacy.training.corpus import Corpus

from ..download import DEFAULT_CACHE_DIR, download_url
from .constants import DATASETS


def dane(  # noqa
    save_path: Optional[PathLike] = None,  # type: ignore
    splits: List[str] = ["train", "dev", "test"],  # noqa # type: ignore
    redownload: bool = False,
    n_sents: int = 1,
    open_unverified_connection: bool = False,
    **kwargs,  # noqa
) -> Union[List[Corpus], Corpus]:  # type: ignore
    """Reads the DaNE dataset as a spaCy Corpus.

    Args:
        save_path (PathLike, optional): Path to the DaNE dataset. If the folder
            does not contain the dataset, it is downloaded there. Defaults to
            None, corresponding to the "datasets" subfolder of
            dacy.where_is_my_dacy().
        splits (List[str], optional): Which splits of the dataset to return.
            Possible options are "train", "dev", "test", and "all". Defaults
            to ["train", "dev", "test"].
        redownload (bool, optional): Whether the dataset should be
            redownloaded. Defaults to False.
        n_sents (int, optional): Number of sentences per document. Only
            applied when the dataset is downloaded. Defaults to 1.
        open_unverified_connection (bool, optional): Whether to allow
            downloading over an unverified connection. Defaults to False.
        force_extension (bool, optional): Set the extension on the doc
            regardless of whether it already exists. Defaults to False.

    Returns:
        Union[List[Corpus], Corpus]: A spaCy corpus, or a list thereof with
            one corpus per requested split.

    Example:
        >>> from dacy.datasets import dane
        >>> train, dev, test = dane(splits=["train", "dev", "test"])
    """
    if open_unverified_connection:
        # disable SSL certificate verification for the download
        import ssl

        ssl._create_default_https_context = ssl._create_unverified_context

    if save_path is None:
        save_path_ = Path(DEFAULT_CACHE_DIR) / "datasets"
    else:
        save_path_ = Path(save_path)
    save_path = save_path_ / "dane"

    # download and unpack the dataset if it is missing or a redownload is forced
    if redownload is True or (not save_path.exists()):
        save_path.mkdir(parents=True, exist_ok=True)
        dl_path = save_path / "dane.zip"
        download_url(DATASETS["dane"], str(dl_path))
        shutil.unpack_archive(dl_path, save_path)
        dl_path.unlink()

    wpaths = [
        "dane_train.conllu",
        "dane_dev.conllu",
        "dane_test.conllu",
        "dane.conllu",
    ]

    for _wpath in wpaths:
        wpath = save_path / _wpath
        cpath = save_path / (wpath.stem + f"_{n_sents}")
        # skip splits already converted for this n_sents
        if cpath.with_suffix(".spacy").is_file():
            continue
        cpath = cpath.with_suffix(".conllu")
        shutil.copyfile(wpath, cpath)

        # convert the CoNLL-U file to the binary .spacy format
        subprocess.run(
            [
                sys.executable,
                "-m",
                "spacy",
                "convert",
                str(cpath),
                str(save_path),
                "--converter",
                "conllu",
                "--merge-subtokens",
                "-n",
                str(n_sents),
            ],
            check=True,
        )
        cpath.unlink()

    if isinstance(splits, str):  # type: ignore
        splits = [splits]  # type: ignore
    corpora = []
    paths = {
        "all": f"dane_{n_sents}.spacy",
        "test": f"dane_test_{n_sents}.spacy",
        "dev": f"dane_dev_{n_sents}.spacy",
        "train": f"dane_train_{n_sents}.spacy",
    }

    for split in splits:
        corpora.append(Corpus(save_path / paths[split]))  # type: ignore
    if len(corpora) == 1:
        return corpora[0]  # type: ignore
    return corpora
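
A minimal usage sketch: the returned Corpus objects are callable with an nlp
object and yield spacy.training.Example instances (standard spaCy behavior,
not specific to DaCy). The blank "da" pipeline below is an arbitrary choice
for illustration; any Danish pipeline works.

    import spacy

    from dacy.datasets import dane

    nlp = spacy.blank("da")        # assumption: a blank Danish pipeline
    test = dane(splits=["test"])   # a single requested split returns one Corpus

    # Corpus is callable with an nlp object and yields Example objects
    examples = list(test(nlp))
    print(f"{len(examples)} examples in the test split")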