summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- dataset.py 33
-rw-r--r-- neural_network.py 6
2 files changed, 22 insertions, 17 deletions
diff --git a/dataset.py b/dataset.py
index 78d4691..e81e7ec 100644
--- a/dataset.py
+++ b/dataset.py
@@ -1,26 +1,35 @@
import numpy as np
from PIL import Image
from pathlib import Path
+from alphabet import CYRILLIC_ALPHABET
+import random
"""Class to interface the training and testing data."""
+DATASET_SIZE=15480
class Dataset:
def __init__(self) -> None:
self.data_path = Path('./data')
+ self.already_used = set()
- """Convert the dataset to a 2 dimension array."""
- def data(self):
- for dir in self.data_path.iterdir():
- if not dir.is_dir():
- continue
+ """
+ Yield a random sample of the dataset with each call.
+ """
+ def data(self, batch_size=DATASET_SIZE):
+ for i in range(batch_size):
+ random_letter = random.choice(CYRILLIC_ALPHABET)
+ images = list((self.data_path/random_letter).glob('*.png'))
+ file_to_yield = random.choice(images).name
- for file in dir.glob('*.png'):
- image = Image.open(str(file))
- image_array = self._img_to_array(image)
- # Return the image's pixel values as an array alongside
- # the character that it represents.
- yield (dir.name, image_array)
+ if file_to_yield in self.already_used:
+ continue
+
+ self.already_used.add(file_to_yield)
+ image = Image.open(str(self.data_path/random_letter/file_to_yield))
+ image_array = self._img_to_array(image)
+
+ yield (random_letter, image_array)
"""
Get an image from the dataset.
@@ -37,7 +46,7 @@ class Dataset:
Grab the image in RGB, add a white background, and return it as
a black and white array.
"""
- def _img_to_array(self, image: Image):
+ def _img_to_array(self, image):
fill_color = (255, 255, 255) # White background.
background = Image.new(image.mode[:-1], image.size, fill_color)
background.paste(image, image.split()[-1])
diff --git a/neural_network.py b/neural_network.py
index 6c44ddb..2163173 100644
--- a/neural_network.py
+++ b/neural_network.py
@@ -2,11 +2,7 @@ import numpy as np
import math
from scipy.special import expit
from secrets import token_hex
-
-CYRILLIC_ALPHABET = ['I', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ë', 'Ж', 'З',
- 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С',
- 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', 'Ш', 'Щ', 'Ъ', 'Ы',
- 'Ь', 'Э', 'Ю', 'Я']
+from alphabet import CYRILLIC_ALPHABET
"""The neural network class."""