Standard Library I

Start in JupyterHub

Not reinventing the wheel

# would you define pi manually?
# would you implement the cos function yourself?

pi = 3.1415926535

def cos(x):
    # truncated Taylor series of cos around 0 — terribly inaccurate
    # away from x = 0, and the hand-typed pi above is imprecise too
    return 1 - x**2 / 2 + x**4 / 24 - x**6 / 720 + x**8 / 40320 - x**10 / 3628800
    
cosine = cos(2 * pi * 45 / 360)
print(cosine)
0.7071067810877982
# or would you use the standard library?

import math

# this is accurate
cosine = math.cos(45 / 360 * 2 * math.pi)
print(cosine)
0.7071067811865476

Python comes “batteries included”. Search the standard library before you implement something yourself. Library code tends to be faster, more stable and better maintained. Also, library code has been checked many times by multiple people, so bugs are rare. When programming Python, you have to get a feel for what has already been implemented in the standard library. This chapter is meant to help with this.

random

https://docs.python.org/3/library/random.html

Never use this for cryptographic applications!

We implement some functions from the random module just to see how much time you can save by not implementing these yourself.

import random

# we require something indexable (O(1) access)
def choice(population):
    """Return a random element of a non-empty indexable population."""
    if not len(population):
        raise IndexError("Cannot choose from an empty sequence")
    # randrange(n) yields an index in [0, n)
    return population[random.randrange(len(population))]
def choices(population, k):
    """Draw k elements with replacement via repeated choice() calls."""
    drawn = []
    for _ in range(k):
        drawn.append(choice(population))
    return drawn
def shuffle(seq):
    """Shuffle seq in place (Fisher-Yates)."""
    length = len(seq)
    for position in range(length - 1):
        # pick the element for `position` from the not-yet-fixed tail
        other = random.randrange(position, length)
        seq[position], seq[other] = seq[other], seq[position]
def sample(seq, k):
    """Return k distinct elements drawn from seq.

    Runs a partial Fisher-Yates shuffle on a copy; reservoir sampling
    would be an alternative.
    """
    pool = list(seq)
    size = len(pool)
    for front in range(k):
        pick = random.randrange(front, size)
        pool[front], pool[pick] = pool[pick], pool[front]
    return pool[:k]
sample(range(20), 5)
[11, 5, 10, 12, 15]

collections

https://docs.python.org/3/library/collections.html

# remember Mr. Potter from the first chapter?
harry = ("Potter", "Harry", 5)
from collections import namedtuple

# namedtuple is mostly self-documenting
# prefer namedtuple over regular tuples
Person = namedtuple('Person', ['lastname', 'firstname', 'school_year'])

harry = Person("Potter", "Harry", 5)
harry
Person(lastname='Potter', firstname='Harry', school_year=5)
harry.school_year
5
# unpacking is still possible
ln, fn, year = harry
ln
'Potter'

Assignment SL1

  1. Wörter zählen Schreiben Sie eine Funktion, die einen String nimmt, in Kleinbuchstaben umwandelt, an Leerzeichen trennt und dann die Anzahl der Wörter in einem dict zählt. Sie dürfen davon ausgehen, dass keine Sonderzeichen im String vorkommen. count_words("A rose is a rose is a rose") -> {'a': 3, 'rose': 3, 'is': 2}

def count_words(text):
    """Count how often each word occurs in text, ignoring case."""
    counts = {}
    for token in text.lower().strip().split():
        # dict.get with a default folds the two-step lookup into one line
        counts[token] = counts.get(token, 0) + 1
    return counts
count_words("A rose is a rose is a rose")
{'a': 3, 'rose': 3, 'is': 2}
from collections import Counter

def count_words(text):
    """Count word occurrences (case-insensitive) with a Counter."""
    counts = Counter()
    # Counter supports += on missing keys (they default to 0)
    for token in text.lower().strip().split():
        counts[token] += 1
    return counts
count_words("A rose is a rose is a rose")
Counter({'a': 3, 'rose': 3, 'is': 2})
dict(count_words("A rose is a rose is a rose"))
{'a': 3, 'rose': 3, 'is': 2}
from collections import Counter

def count_words(text):
    """One-liner: feed the lower-cased, whitespace-split words to Counter."""
    tokens = text.lower().split()
    return Counter(tokens)
count_words("A rose is a rose is a rose")
Counter({'a': 3, 'rose': 3, 'is': 2})
  1. k häufigste Wörter Nutzen Sie die Funktion aus Teil 1, um die k häufigsten Wörter in einem String zu finden. Sie dürfen nach wie vor davon ausgehen, dass keine Sonderzeichen vorkommen. k_most_frequent("A rose is a rose is a rose", k=2) -> ["a", "rose"]

def k_most_frequent(text, k):
    """Return the k most frequent words in text (via Counter.most_common)."""
    ranked = count_words(text).most_common(k)
    return [word for word, _ in ranked]
k_most_frequent("A rose is a rose is a rose", k=2)
['a', 'rose']
# From Object Oriented Programming

import operator

from collections import Counter

class Heap:
    """Binary min-heap with a pluggable comparison function.

    ``is_smaller(lhs, rhs)`` must return True when *lhs* should sit closer
    to the root; passing e.g. ``operator.gt`` turns this into a max-heap.
    """

    # you can also use a lambda function here
    # is_smaller=lambda lhs, rhs: lhs < rhs
    def __init__(self, iterable=None, *, is_smaller=operator.lt):
        self._buffer = []
        self._is_smaller = is_smaller
        if iterable is None:
            return
        for elem in iterable:
            self.add(elem)
    
    # NO CHANGES BELOW THIS LINE
    
    def _sift_up(self, idx):
        # bubble the entry at idx towards the root while it compares
        # smaller than its parent
        while idx > 0:
            parent = (idx - 1) // 2
            if not self._is_smaller(self._buffer[idx], self._buffer[parent]):
                break
            self._swap(idx, parent)
            idx = parent
    
    def _sift_down(self, idx):
        # sink the entry at idx towards the leaves while a child is smaller
        while True:
            left_child = idx * 2 + 1
            if left_child >= len(self._buffer):
                break
            smaller_child = left_child
            right_child = left_child + 1
            # smaller_child only switches to the right child if that
            # child actually exists and compares smaller
            if right_child < len(self._buffer):
                if self._is_smaller(self._buffer[right_child], self._buffer[left_child]):
                    smaller_child = right_child
            if self._is_smaller(self._buffer[smaller_child], self._buffer[idx]):
                self._swap(idx, smaller_child)
            else:
                break
                
    def _swap(self, i, j):
        # exchange two entries in the underlying list
        self._buffer[i], self._buffer[j] = self._buffer[j], self._buffer[i]
    
    def add(self, value):
        """Insert value, restoring the heap invariant (O(log n))."""
        self._buffer.append(value)
        last_index = len(self._buffer) - 1
        self._sift_up(last_index)
    
    def extract_min(self):
        """Remove and return the smallest element (O(log n)).

        Raises IndexError if the heap is empty.
        """
        if not self._buffer:
            raise IndexError("Extract from empty heap")
        minimum = self._buffer[0]
        # move the last entry to the root, shrink, then repair downwards
        self._swap(0, -1)
        # minimum = self._buffer.pop()
        self._buffer.pop()
        self._sift_down(0)
        return minimum
    
    def __len__(self):
        return len(self._buffer)
    
    def __bool__(self):
        return bool(self._buffer)
    
    # two heaps are equal if they contain the same elements the same number of times
    # see collections section
    def __eq__(self, other):
        return Counter(self._buffer) == Counter(other._buffer)
    
    def __ne__(self, other):
        return not self.__eq__(other)
    
    def __str__(self):
        return repr(self)
    
    # cannot represent self._compare as string
    def __repr__(self):
        return f"<binary min-heap with user-defined comparator and entries {self._buffer}>"
import operator

def k_most_frequent(text, k):
    """Return the k most frequent words, using our own Heap as a max-heap."""
    freq_word_pairs = [(freq, word) for word, freq in count_words(text).items()]
    # operator.gt reverses the comparison, turning the min-heap into a
    # max-heap; storing ALL entries is inefficient — only the top k are
    # actually needed
    max_heap = Heap(freq_word_pairs, is_smaller=operator.gt)
    top = [max_heap.extract_min() for _ in range(k)]
    return [word for _, word in top]
k_most_frequent("A rose is a rose is a rose", k=2)
['rose', 'a']
import heapq

def k_most_frequent(text, k):
    """Return the k most frequent words via heapq.nlargest."""
    freq_word_pairs = [(freq, word) for word, freq in count_words(text).items()]
    # nlargest keeps only k candidates internally — more efficient
    top = heapq.nlargest(k, freq_word_pairs)
    return [word for _, word in top]
k_most_frequent("A rose is a rose is a rose", k=2)
['rose', 'a']
  1. Index-Backmapping ohne Duplikate Schreiben Sie eine Funktion, die eine Liste von Objekten nimmt und ein dict zurückgibt, das jedem Objekt seinen Index in der Liste zuordnet. Gehen Sie davon aus, dass kein Objekt doppelt vorkommt. find_index_nodups(["g", "t", "a"]) -> {'a': 2, 'g': 0, 't': 1}

# see SL1
def find_index_nodups(seq):
    """Map each (unique) entry of seq to its index."""
    backmap = {}
    for position in range(len(seq)):
        # requires indexing of seq
        backmap[seq[position]] = position
    return backmap
def find_index_nodups(seq):
    """Same mapping, built with a manual counter."""
    backmap = {}
    counter = 0
    for item in seq:
        # does not require indexing of seq
        backmap[item] = counter
        counter += 1
    return backmap
def find_index_nodups(seq):
    """Same mapping; enumerate eliminates the manual counter."""
    backmap = {}
    for position, item in enumerate(seq):
        backmap[item] = position
    return backmap
def find_index_nodups(seq):
    """Same mapping as a single expression."""
    pairs = ((item, position) for position, item in enumerate(seq))
    return dict(pairs)
find_index_nodups(["g", "t", "a"])
{'g': 0, 't': 1, 'a': 2}
  1. Index-Backmapping mit Duplikaten Wir erlauben jetzt Duplikate. Passen Sie die Funktion so an, dass das zurückgegebene dict jedem Objekt eine Liste der Indizes aller Vorkommen zuordnet. find_index(["g", "a", "a"]) -> {'a': [1, 2], 'g': [0]}

# dict comprehensions do not work in this case, back to for-loops
def find_index(seq):
    """Map each object to the list of indices of all of its occurrences."""
    positions = {}
    for idx, item in enumerate(seq):
        # lbyl - 2 lookups
        if item not in positions:
            positions[item] = []
        positions[item].append(idx)
    return positions
def find_index(seq):
    """Map each object to the list of indices of all of its occurrences (EAFP)."""
    positions = {}
    for idx, item in enumerate(seq):
        # eafp - 1 lookup
        try:
            positions[item].append(idx)
        except KeyError:
            positions[item] = [idx]
    return positions
from collections import defaultdict

def find_index(seq):
    """Map each object to the list of indices of all of its occurrences."""
    grouped = defaultdict(list)
    for idx, item in enumerate(seq):
        grouped[item].append(idx)
    # hand back a plain dict so callers never observe autovivification
    return dict(grouped)
find_index(["g", "a", "a"])
{'g': [0], 'a': [1, 2]}
  1. Lookup mit Fallback Schreiben Sie eine Funktion, die eine Liste von dicts, einen Key k und einen Default-Wert d nimmt. Die Funktion soll den Key k in den dicts der Reihe nach nachschlagen, bis ein Treffer gefunden wurde. Wenn keines der dicts den Key k enthält, soll stattdessen d zurückgegeben werden.

    config1 = {
        "width": 300,
        "height": 500,
        "color": "red"
    }
    config2 = {
        "color": "blue",
        "fullscreen": True
    }
    fallback_lookup([config2, config1], "fullscreen", False) -> True
    fallback_lookup([config2, config1], "width", 900) -> 300
    fallback_lookup([config2, config1], "color", "green") -> "blue"
    fallback_lookup([config2, config1], "minimized", False) -> False
    
def fallback_lookup(mappings, key, default):
    """Look key up in each mapping in order; return default on total miss."""
    for candidate in mappings:
        try:
            return candidate[key]
        except KeyError:
            continue
    return default
from collections import ChainMap

def fallback_lookup(mappings, key, default):
    """Look key up across the mappings in priority order, else default."""
    layered = ChainMap(*mappings)
    return layered.get(key, default)
config1 = {
    "width": 300,
    "height": 500,
    "color": "red"
}
config2 = {
    "color": "blue",
    "fullscreen": True
}
print(fallback_lookup([config2, config1], "fullscreen", False))
print(fallback_lookup([config2, config1], "width", 900))
print(fallback_lookup([config2, config1], "color", "green"))
print(fallback_lookup([config2, config1], "minimized", False))
True
300
blue
False

We can now use Counter to implement heap equality. Two heaps are equal if they contain the same elements the same number of times.

We will now experimentally verify deque runtimes (see https://wiki.python.org/moin/TimeComplexity)

%%timeit
test = []
for _ in range(100_000):
    test.append(42)
5.95 ms ± 112 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
test = []
for _ in range(100_000):
    test.insert(0, 42)
1.23 s ± 8.23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%timeit
test = list(range(100_000))
for _ in range(100_000):
    test.pop()
9.07 ms ± 604 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
test = list(range(100_000))
for _ in range(100_000):
    del test[0]
758 ms ± 14.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
from collections import deque
%%timeit
test = deque()
for _ in range(100_000):
    test.append(42)
6.16 ms ± 107 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
test = deque()
for _ in range(100_000):
    test.appendleft(42)
6.05 ms ± 114 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
test = deque(range(100_000))
for _ in range(100_000):
    test.pop()
9.12 ms ± 121 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
test = deque(range(100_000))
for _ in range(100_000):
    test.popleft()
9.38 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Assignment SL2

  1. Zufälliges Wort ziehen Gegeben sei ein dict mit Wörtern und ihren Häufigkeiten wie in den vorigen Aufgaben. Schreiben Sie eine Funktion, die ein zufälliges Wort aus den Wörtern zieht. Groß-/Kleinschreibung ist unwichtig. Doppelte Wörter sollen nicht höher gewichtet werden.

import random

def random_word(words_with_frequencies):
    """Draw a uniformly random word; frequencies are deliberately ignored."""
    # iterating a dict yields its keys, so list(d) == list(d.keys())
    return random.choice(list(words_with_frequencies))
random_word(count_words("a a a a a a a a a a a a a a a b c"))
'a'
  1. Zufälliges Wort ziehen mit Gewichtung Schreiben Sie die Funktion aus Teil 1 so um, dass Wörter, die im Text häufiger vorkommen, mit einer höheren Wahrscheinlichkeit gezogen werden. Zum Beispiel soll ein Wort, das im Text dreimal vorkommt, dreimal so wahrscheinlich gezogen werden wie ein Wort, das nur einmal vorkommt.

import random

def random_word_weighted(words_with_frequencies):
    """Draw a random word, weighted proportionally to its frequency."""
    candidates = list(words_with_frequencies)
    weights = [words_with_frequencies[word] for word in candidates]
    # random.choices returns a list of k=1 samples; unwrap it
    return random.choices(candidates, weights)[0]
random_word_weighted(count_words("a a a a a a a a a a a a a a a b c"))
'c'

sys & io

https://docs.python.org/3/library/sys.html

https://docs.python.org/3/library/io.html

from collections import namedtuple

Point2D = namedtuple('Point2D', ['x', 'y'])

Popular formats for storing data as plain text

  1. Number of records in first line

3
2 6
2.3 7.3
0.1 2.2
  1. Records end with EOF instead

2 6
2.3 7.3
0.1 2.2
def read_points_from_stdin_with_count():
    """Read a record-count line, then that many "x y" lines, from stdin."""
    count = int(input())
    points = []
    for _ in range(count):
        # this relies on implicit input validation: a malformed line raises
        # an exception, but its cause will be difficult to trace —
        # you should always validate your input explicitly
        raw_x, raw_y = input().split()
        points.append(Point2D(float(raw_x), float(raw_y)))
    return points

The sys module provides means of accessing the standard streams as well as command line arguments via argv. You can also use sys.exit(code) to exit the current program with a custom status code.

import sys

def read_points_from_stdin_without_count():
    """Read "x y" lines from stdin until EOF."""
    points = []
    # sys.stdin is iterable line by line, just like a file object
    for raw in sys.stdin:
        raw_x, raw_y = raw.strip().split()
        points.append(Point2D(float(raw_x), float(raw_y)))
    return points
def read_points_from_file_without_count(filename):
    """Read "x y" lines from the named file until EOF."""
    points = []
    # more details on "with" later!
    with open(filename, 'r') as file:
        for raw in file:
            raw_x, raw_y = raw.strip().split()
            points.append(Point2D(float(raw_x), float(raw_y)))
    return points

These functions look very similar - this is not by accident! sys.stdin is file-like i.e. it behaves like a file.

# "file" is a file-like object
def read_points_from_file_like(file):
    points = []
    for line in file:
        line = line.strip()
        x, y = line.split()
        points.append(Point2D(float(x), float(y)))
    return points

Take a look inside io - streams are organized in a hierarchy of classes. Text streams are just variations of binary streams where binary data is interpreted according to a specified encoding.

import io

points = """\
4 5
2 7
1 -5
2 1
"""

# StringIO is also file-like
point_objects = read_points_from_file_like(io.StringIO(points))
point_objects
[Point2D(x=4.0, y=5.0),
 Point2D(x=2.0, y=7.0),
 Point2D(x=1.0, y=-5.0),
 Point2D(x=2.0, y=1.0)]

Assignment SL3 (with 🧑‍🏫-Bonus)

  1. Datei schreiben Schreiben Sie eine Funktion, die eine Liste von 2D-Punkten im besprochenen namedtuple-Format sowie ein file-like object nimmt und die Punkte im selben textbasierten Format speichert, das wir im Beispiel eingelesen haben.

Bonus🧑‍🏫: Take a look at zipfile and try to write a compressed file.

Bonus🧑‍🏫: Use pathlib instead of open.

def write_points(file, points):
    """Write the points as "x y" lines to a file-like object."""
    lines = [f"{p.x} {p.y}\n" for p in points]
    file.writelines(lines)
with open("points.dump", "w") as file:
    write_points(file, point_objects)
from zipfile import ZipFile

points_string = "\n".join([f"{p.x} {p.y}" for p in point_objects])

with ZipFile("points.zip", "w") as zipfile:
    zipfile.writestr("/points.dump", points_string)
        
from pathlib import Path
with (Path(".")/"points.dump").open("w") as file:
    write_points(file, point_objects)

pickle

https://docs.python.org/3/library/pickle.html

pickle is Python’s binary serializer. It is straightforward to use, but files produced by pickle are generally not human-readable.

import pickle

points = [
    Point2D(x=4, y=5),
    Point2D(x=2, y=7),
    Point2D(x=1, y=-5),
    Point2D(x=2, y=1)
]

# binary format instead of text-based format
pickle.dumps(points)
b'\x80\x04\x95E\x00\x00\x00\x00\x00\x00\x00]\x94(\x8c\x08__main__\x94\x8c\x07Point2D\x94\x93\x94K\x04K\x05\x86\x94\x81\x94h\x03K\x02K\x07\x86\x94\x81\x94h\x03K\x01J\xfb\xff\xff\xff\x86\x94\x81\x94h\x03K\x02K\x01\x86\x94\x81\x94e.'
with open("points.pickle", 'wb') as file:
    # dump takes a file-like object
    pickle.dump(points, file)
with open("points.pickle", 'rb') as file:
    loaded_points = pickle.load(file)
    
print(loaded_points)
[Point2D(x=4, y=5), Point2D(x=2, y=7), Point2D(x=1, y=-5), Point2D(x=2, y=1)]

Bonus Assignment SL4 🧑‍🏫

  1. dict im Sekundärspeicher🧑‍🏫 Schreiben Sie eine Klasse SerializeDict, die sich nach außen wie ein dict verhält, aber alle Werte mithilfe von pickle im Sekundärspeicher (Datei auf Festplatte/SSD) ablegt.

MinHeap Without Duplicates

This version of the heap disallows duplicates, instead adjusting the priority if the object is already contained in the heap. This is not possible when using heapq. Disallowing duplicates allows deleting objects from the heap, but forces the programmer to store the key separately. Using a class helps keep the state of _buffer and _index consistent.

from collections import namedtuple

# internal heap entry: priority first, value second
_Entry = namedtuple('_Entry', ['priority', 'value'])

# do not inherit from MinHeap -> liskov substitution principle
class MinHeapWithSeparatePriority:
    """Binary min-heap keyed by a separate priority, without duplicate values.

    Adding a value that is already contained adjusts its priority instead of
    inserting a second copy.  ``_index`` maps value -> position of its entry
    in ``_buffer``, which makes containment checks and removals O(1) lookups.
    """

    def __init__(self, iterable=None):
        self._buffer = []   # heap-ordered list of _Entry(priority, value)
        self._index = {}    # value -> index of its entry in _buffer
        if iterable is None:
            return
        for value, priority in iterable:
            self.add(priority, value)

    def add(self, priority, value):
        """Insert value with priority, or update its priority if present."""
        try:
            existing_index = self._index[value]
        except KeyError:
            self._buffer.append(_Entry(priority, value))
            last_index = len(self._buffer) - 1
            self._index[value] = last_index
            self._sift_up(last_index)
        else:
            old_prio = self._buffer[existing_index].priority
            self._buffer[existing_index] = _Entry(priority, value)
            # repair the invariant in whichever direction it broke
            if old_prio < priority:
                self._sift_down(existing_index)
            else:
                self._sift_up(existing_index)

    def extract_min(self):
        """Remove and return the value with the smallest priority.

        Raises IndexError if the heap is empty.
        """
        if not self._buffer:
            raise IndexError("Extract from empty heap")
        minimum = self._buffer[0].value
        self._remove(0)
        return minimum

    def remove(self, value):
        """Remove value from the heap; do nothing if it is not contained."""
        try:
            existing_index = self._index[value]
        except KeyError:
            pass
        else:
            self._remove(existing_index)

    def _remove(self, index):
        """Remove the entry at index and repair the heap."""
        value_at_index = self._buffer[index].value
        self._swap(index, len(self._buffer) - 1)
        self._buffer.pop()
        del self._index[value_at_index]
        # bugfix: the entry moved in from the end can violate the invariant
        # in EITHER direction; the original only sifted down, which left the
        # heap corrupted when the moved entry was smaller than its parent.
        # (only one of the two calls will actually move anything)
        if index < len(self._buffer):
            self._sift_up(index)
            self._sift_down(index)

    def _sift_up(self, idx):
        # bubble the entry at idx towards the root while it is smaller
        # than its parent
        while idx > 0:
            parent = (idx - 1) // 2
            if not (self._buffer[idx].priority < self._buffer[parent].priority):
                break
            self._swap(idx, parent)
            idx = parent

    def _sift_down(self, idx):
        # sink the entry at idx towards the leaves while a child is smaller
        while True:
            left_child = idx * 2 + 1
            if left_child >= len(self._buffer):
                break
            smaller_child = left_child
            right_child = left_child + 1
            if right_child < len(self._buffer):
                if self._buffer[right_child].priority < self._buffer[left_child].priority:
                    smaller_child = right_child

            if self._buffer[smaller_child].priority < self._buffer[idx].priority:
                self._swap(idx, smaller_child)
            else:
                break

    def _swap(self, i, j):
        self._buffer[i], self._buffer[j] = self._buffer[j], self._buffer[i]
        # bugfix: _index must be keyed by the VALUE of the entry.  The
        # original wrote `self._index[self._buffer[i]] = i`, keying by the
        # _Entry tuple itself — every swap left the value's recorded
        # position stale, so a later add() of an existing value updated
        # (and clobbered) the wrong slot.
        self._index[self._buffer[i].value] = i
        self._index[self._buffer[j].value] = j

    # see iterators chapter
    def __iter__(self):
        # yields the values in internal buffer (heap) order, not sorted order
        for entry in self._buffer:
            yield entry.value


    # len, bool, str, etc.
heap = MinHeapWithSeparatePriority()
heap.add(3, "b")
heap.add(5, "c")
heap.add(1, "a")
heap.add(0, "c")
print(heap.extract_min())
print(heap.extract_min())
print(heap.extract_min())
c
a
b

Assignment SL5

  1. Veränderbarer 2D-Punkt Später benötigen wir eine Implementierung von Point2D, deren Koordinaten x und y nachträglich verändert werden können. Dazu können wir leider nicht namedtuple verwenden, sondern müssen eine eigene Klasse schreiben. Implementieren Sie dazu den Konstruktor sowie __str__, __repr__, __eq__ und __ne__. An diesem Punkt sei auf das Modul dataclasses verwiesen, das allerdings in diesem Kurs nicht thematisiert wird, die Verwendung ist also freigestellt.

  2. Distanz zu anderen Punkten Erweitern Sie Ihre Klasse Point2D um eine Funktion dist(p), die die euklidische Distanz zu einem anderen Punkt p bestimmt.

import math

class Point2DMutable:
    """Mutable 2D point with equality and Euclidean distance."""
    __slots__ = '_x', '_y'

    # no defaults!
    def __init__(self, x, y):
        # no redundant state
        self._x = x
        self._y = y

    def __str__(self):
        return f"{self._x}, {self._y}"

    def __repr__(self):
        return f"Point2DMutable({self._x}, {self._y})"

    def __eq__(self, other):
        return (self._x, self._y) == (other._x, other._y)

    def __ne__(self, other):
        return not self.__eq__(other)

    def dist(self, p):
        """Return the Euclidean distance to another point p."""
        # bugfix: the original passed [p._y, p._y], ignoring p's
        # x coordinate entirely and returning wrong distances
        return math.dist((self._x, self._y), (p._x, p._y))

Assignment SL6 🧑‍🏫

  1. Kompliziertes Dateiformat Schreiben Sie eine Funktion, die das unten beschriebene Dateiformat aus einem übergebenen file-like object ausliest, alle Bedingungen an die Werte überprüft und dann als Liste von namedtuples zurückgibt. Im Fehlerfall soll eine geeignete Exception geworfen werden.

  2. Daten verarbeiten Lesen Sie das untenstehende Beispiel ein und berechnen Sie die Summe der Wahrscheinlichkeiten aller Datenzeilen.

Das Dateiformat besteht aus einer beliebigen Anzahl Zeilen. Eine solche Zeile ist entweder leer, ein Kommentar, oder eine Datenzeile.

Leere Zeilen dienen der Lesbarkeit und sollen einfach ignoriert werden. Kommentarzeilen beginnen mit einem # und dienen der Dokumentation, diese sollen beim Einlesen ebenfalls ignoriert werden.

Datenzeilen enthalten drei durch Kommas getrennte Werte: Der erste Wert ist eine ganzzahlige, nichtnegative, innerhalb einer Datei eindeutige Nummerierung der Datenzeile. Der zweite Wert ist eine Wahrscheinlichkeitsangabe, also eine Fließkommazahl zwischen 0 und 1. Der dritte Wert soll Ja/Nein-Eigenschaften darstellen. Diese sind in einem String aus exakt 5 Zeichen kodiert. Erlaubte Zeichen sind “Y” und “N”.

Direkt vor und nach Kommas sowie am Zeilenanfang und -ende dürfen Leerzeichen stehen, damit die Spalten aneinander ausgerichtet werden können.

# id, probability, properties
  0,  0.25,        YYYNN

  1,  0.1,         NNNNY
# very probable
  4,  0.6,         YYYYY
from collections import namedtuple


Row = namedtuple("Row", ["id", "probability", "properties"])

class ParseError(Exception):
    pass


def _parse_id(s):
    try:
        i = int(s)
    except ValueError as e:
        raise ParseError("ID must be numeric") from e
    if i < 0:
        raise ParseError("ID must be positive")
    return i


def _parse_probability(s):
    try:
        f = float(s)
    except ValueError as e:
        raise ParseError("probability must be a valid number") from e
    if f > 1 or f < 0:
        raise ParseError("probability must be between 0 and 1")
    return f
    

def _parse_properties(s):
    if len(s) != 5:
        raise ParseError("properties must be exactly 5 characters long") from e
    for c in s:
        if c not in "YN":
            raise ParseError("properties can only consist of 'Y' and 'N'") from e
    return s  


def _parse_line(line):
    """Split a data line into (id, probability, properties), validating each field."""
    fields = line.split(",")
    if len(fields) < 3:
        raise ParseError("Not enough values")
    if len(fields) > 3:
        raise ParseError("Too many values")

    # whitespace around the separators is allowed for column alignment
    raw_id, raw_prob, raw_props = (field.strip() for field in fields)
    return (
        _parse_id(raw_id),
        _parse_probability(raw_prob),
        _parse_properties(raw_props),
    )
    

def parse_file(file):
    """Parse the whole file-like object into a list of Row namedtuples.

    Blank lines and '#' comment lines are skipped; duplicate IDs raise
    ParseError.
    """
    rows = []
    seen_ids = set()
    for raw in file:
        stripped = raw.strip()
        if not stripped or stripped.startswith('#'):
            continue

        id_, probability, properties = _parse_line(stripped)

        if id_ in seen_ids:
            raise ParseError(f"ID not unique: {id_}")
        seen_ids.add(id_)
        rows.append(Row(id_, probability, properties))
    return rows
rows = parse_file(io.StringIO("""\
# id, probability, properties
  0,  0.25,        YYYNN

  1,  0.1,         NNNNY
# very probable
  4,  0.6,         YYYYY
"""))
rows
[Row(id=0, probability=0.25, properties='YYYNN'),
 Row(id=1, probability=0.1, properties='NNNNY'),
 Row(id=4, probability=0.6, properties='YYYYY')]