Source code for detectors.data.english_chars

import os
from typing import Callable, Optional

from torchvision.datasets import ImageFolder
from torchvision.datasets.utils import check_integrity, download_and_extract_archive


class EnglishChars(ImageFolder):
    """English subset of the Chars74K character recognition dataset.

    In the English language, Latin script (excluding accents) and Hindu-Arabic numerals are used.
    For simplicity we call this the "English" characters set. The full Chars74K dataset consists of:

    * 62 classes (0-9, A-Z, a-z)
    * 7705 characters obtained from natural images
    * 3410 hand drawn characters using a tablet PC
    * 62992 synthesised characters from computer fonts
    * This gives a total of over 74K images (which explains the name of the dataset).

    This class downloads ``EnglishImg.tgz`` and reads the images stored under
    ``English/Img/GoodImg/Bmp/`` inside the extracted archive.

    Args:
        root: Directory where the archive is downloaded to. The archive is extracted
            into the ``chars74k`` sub-folder of this directory.
        split: Accepted for signature compatibility; it is not used by this dataset.
        transform: Optional callable forwarded to ``ImageFolder``.
        download: If ``True``, download and extract the archive when it is missing.
        **kwargs: Extra keyword arguments forwarded to ``ImageFolder``.

    Raises:
        RuntimeError: If the archive is missing or its MD5 checksum does not match.
    """

    base_folder = "chars74k"
    images_folder = "English/Img/GoodImg/Bmp/"
    filename = "EnglishImg.tgz"
    file_md5 = "85d157e0c58f998e1cda8def62bcda0d"
    url = "http://www.ee.surrey.ac.uk/CVSSP/demos/chars74k/EnglishImg.tgz"

    def __init__(
        self, root: str, split=None, transform: Optional[Callable] = None, download: bool = False, **kwargs
    ) -> None:
        self.root = os.path.expanduser(root)
        # ``ImageFolder.__init__`` rebinds ``self.root`` to the folder it is given
        # (the image folder below), so the user-supplied root is kept separately.
        # Without this, ``download()`` / ``_check_integrity()`` called after
        # construction would resolve paths relative to the image folder.
        self._download_root = self.root

        if download:
            self.download()

        if not self._check_integrity():
            raise RuntimeError("Dataset not found or corrupted. You can use download=True to download it")
        super().__init__(self._split_folder, transform=transform, **kwargs)

    @property
    def _dataset_folder(self) -> str:
        """Folder the archive is extracted into (``<root>/chars74k``)."""
        return os.path.join(self._download_root, self.base_folder)

    @property
    def _split_folder(self) -> str:
        """Folder holding the per-class image directories handed to ``ImageFolder``."""
        return os.path.join(self._dataset_folder, self.images_folder)

    def _check_integrity(self) -> bool:
        """Return whether the downloaded archive exists and matches ``file_md5``."""
        archive_path = os.path.join(self._download_root, self.filename)
        return check_integrity(archive_path, self.file_md5)

    def _check_exists(self) -> bool:
        """Return whether the extracted image folder is present on disk."""
        return os.path.exists(self._split_folder)

    def download(self) -> None:
        """Download and extract the archive.

        Does nothing when the archive passes the checksum and the extracted
        image folder already exists.
        """
        if self._check_integrity() and self._check_exists():
            return
        download_and_extract_archive(
            self.url,
            download_root=self._download_root,
            extract_root=self._dataset_folder,
            md5=self.file_md5,
        )