From 3f0e8c31bcf4eb9c1e9346a4275d535f2101b65c Mon Sep 17 00:00:00 2001 From: HombreLaser Date: Mon, 23 Oct 2023 17:53:59 -0600 Subject: Improve dataset class --- dataset.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'dataset.py') diff --git a/dataset.py b/dataset.py index 78d4691..e81e7ec 100644 --- a/dataset.py +++ b/dataset.py @@ -1,26 +1,35 @@ import numpy as np from PIL import Image from pathlib import Path +from alphabet import CYRILLIC_ALPHABET +import random """Class to interface the training and testing data.""" +DATASET_SIZE=15480 class Dataset: def __init__(self) -> None: self.data_path = Path('./data') + self.already_used = set() - """Convert the dataset to a 2 dimension array.""" - def data(self): - for dir in self.data_path.iterdir(): - if not dir.is_dir(): - continue + """ + Yield a random sample of the dataset with each call. + """ + def data(self, batch_size=DATASET_SIZE): + for i in range(batch_size): + random_letter = random.choice(CYRILLIC_ALPHABET) + images = list((self.data_path/random_letter).glob('*.png')) + file_to_yield = random.choice(images).name - for file in dir.glob('*.png'): - image = Image.open(str(file)) - image_array = self._img_to_array(image) - # Return the image's pixel values as an array alongside - # the character that it represents. - yield (dir.name, image_array) + if file_to_yield in self.already_used: + continue + + self.already_used.add(file_to_yield) + image = Image.open(str(self.data_path/random_letter/file_to_yield)) + image_array = self._img_to_array(image) + + yield (random_letter, image_array) """ Get an image from the dataset. @@ -37,7 +46,7 @@ class Dataset: Grab the image in RGB, add a white background, and return it as a black and white array. """ - def _img_to_array(self, image: Image): + def _img_to_array(self, image): fill_color = (255, 255, 255) # White background. background = Image.new(image.mode[:-1], image.size, fill_color) background.paste(image, image.split()[-1]) -- cgit v1.2.3