summary refs log tree commit diff
path: root/dataset.py
diff options
context:
space:
mode:
Diffstat (limited to 'dataset.py')
-rw-r--r-- dataset.py 33
1 files changed, 21 insertions, 12 deletions
diff --git a/dataset.py b/dataset.py
index 78d4691..e81e7ec 100644
--- a/dataset.py
+++ b/dataset.py
@@ -1,26 +1,35 @@
import numpy as np
from PIL import Image
from pathlib import Path
+from alphabet import CYRILLIC_ALPHABET
+import random
"""Class to interface the training and testing data."""
+DATASET_SIZE=15480
class Dataset:
def __init__(self) -> None:
self.data_path = Path('./data')
+ self.already_used = set()
- """Convert the dataset to a 2 dimension array."""
- def data(self):
- for dir in self.data_path.iterdir():
- if not dir.is_dir():
- continue
+ """
+ Yield a random sample of the dataset with each call.
+ """
+ def data(self, batch_size=DATASET_SIZE):
+ for i in range(batch_size):
+ random_letter = random.choice(CYRILLIC_ALPHABET)
+ images = list((self.data_path/random_letter).glob('*.png'))
+ file_to_yield = random.choice(images).name
- for file in dir.glob('*.png'):
- image = Image.open(str(file))
- image_array = self._img_to_array(image)
- # Return the image's pixel values as an array alongside
- # the character that it represents.
- yield (dir.name, image_array)
+ if file_to_yield in self.already_used:
+ continue
+
+ self.already_used.add(file_to_yield)
+ image = Image.open(str(self.data_path/random_letter/file_to_yield))
+ image_array = self._img_to_array(image)
+
+ yield (random_letter, image_array)
"""
Get an image from the dataset.
@@ -37,7 +46,7 @@ class Dataset:
Grab the image in RGB, add a white background, and return it as
a black and white array.
"""
- def _img_to_array(self, image: Image):
+ def _img_to_array(self, image):
fill_color = (255, 255, 255) # White background.
background = Image.new(image.mode[:-1], image.size, fill_color)
background.paste(image, image.split()[-1])