Standard Library I¶
Not reinventing the wheel¶
# would you define pi manually?
# would you implement the cos function yourself?
pi = 3.1415926535
def cos(x):
    # terribly inaccurate
    # (truncated Maclaurin series of cos up to the x**10 term;
    # the error grows quickly for arguments away from 0)
    return 1 - x**2 / 2 + x**4 / 24 - x**6 / 720 + x**8 / 40320 - x**10 / 3628800
# cos(45 degrees), converting degrees to radians by hand
cosine = cos(2 * pi * 45 / 360)
print(cosine)
0.7071067810877982
# or would you use the standard library?
import math
# this is accurate
cosine = math.cos(45 / 360 * 2 * math.pi)
print(cosine)
0.7071067811865476
Python comes “batteries included”. Search the standard library before you implement something yourself. Library code tends to be faster, more stable and better maintained. Also, library code has been checked many times by multiple people, so bugs are rare. When programming Python, you have to get a feel for what has already been implemented in the standard library. This chapter is meant to help with this.
random
¶
https://docs.python.org/3/library/random.html
Never use this for cryptographic applications!
We implement some functions from the random
module just to see how much time you can save by not implementing these yourself.
import random
# we require something indexable (O(1) access)
# we require something indexable (O(1) access)
def choice(population):
    """Return one element of `population`, chosen uniformly at random.

    Raises IndexError for an empty population (like random.choice).
    """
    size = len(population)
    if not size:
        raise IndexError("Cannot choose from an empty sequence")
    # random.randrange(size) yields a uniform index in [0, size)
    return population[random.randrange(size)]
def choices(population, k):
    """Draw k elements with replacement, each chosen uniformly via choice()."""
    drawn = []
    for _ in range(k):
        drawn.append(choice(population))
    return drawn
def shuffle(seq):
    """Shuffle seq in place using the Fisher-Yates algorithm."""
    length = len(seq)
    for i in range(length - 1):
        # pick the element for slot i uniformly from the not-yet-fixed tail
        j = random.randrange(i, length)
        seq[i], seq[j] = seq[j], seq[i]
def sample(seq, k):
    """Return k distinct elements of seq via a partial Fisher-Yates shuffle.

    Alternative approach: reservoir sampling.
    """
    pool = list(seq)
    # only the first k slots need to be shuffled into place
    for i in range(k):
        j = random.randrange(i, len(pool))
        pool[i], pool[j] = pool[j], pool[i]
    return pool[:k]
sample(range(20), 5)
[11, 5, 10, 12, 15]
collections
¶
https://docs.python.org/3/library/collections.html
# remember Mr. Potter from the first chapter?
harry = ("Potter", "Harry", 5)
from collections import namedtuple
# namedtuple is mostly self-documenting
# prefer namedtuple over regular tuples
Person = namedtuple('Person', ['lastname', 'firstname', 'school_year'])
harry = Person("Potter", "Harry", 5)
harry
Person(lastname='Potter', firstname='Harry', school_year=5)
harry.school_year
5
# unpacking is still possible
ln, fn, year = harry
ln
'Potter'
Assignment SL1¶
Wörter zählen Schreiben Sie eine Funktion, die einen String nimmt, in Kleinbuchstaben umwandelt, an Leerzeichen trennt und dann die Anzahl der Wörter in einem
dict
zählt. Sie dürfen davon ausgehen, dass keine Sonderzeichen im String vorkommen.count_words("A rose is a rose is a rose") -> {'a': 3, 'rose': 3, 'is': 2}
def count_words(text):
    """Count case-insensitive, whitespace-separated word occurrences in text."""
    counts = {}
    for word in text.lower().strip().split():
        # dict.get supplies 0 for words seen for the first time
        counts[word] = counts.get(word, 0) + 1
    return counts
count_words("A rose is a rose is a rose")
{'a': 3, 'rose': 3, 'is': 2}
from collections import Counter
def count_words(text):
    """Count case-insensitive word occurrences using collections.Counter."""
    counts = Counter()
    # Counter.update tallies every element of the iterable
    counts.update(text.lower().strip().split())
    return counts
count_words("A rose is a rose is a rose")
Counter({'a': 3, 'rose': 3, 'is': 2})
dict(count_words("A rose is a rose is a rose"))
{'a': 3, 'rose': 3, 'is': 2}
from collections import Counter
def count_words(text):
    """Count case-insensitive word occurrences; Counter does all the work."""
    words = text.lower().split()
    return Counter(words)
count_words("A rose is a rose is a rose")
Counter({'a': 3, 'rose': 3, 'is': 2})
k häufigste Wörter Nutzen Sie die Funktion aus Teil 1, um die k häufigsten Wörter in einem String zu finden. Sie dürfen nach wie vor davon ausgehen, dass keine Sonderzeichen vorkommen.
k_most_frequent("A rose is a rose is a rose", k=2) -> ["a", "rose"]
def k_most_frequent(text, k):
    """Return the k most frequent words in text, most frequent first."""
    top = count_words(text).most_common(k)
    return [word for word, _ in top]
k_most_frequent("A rose is a rose is a rose", k=2)
['a', 'rose']
# From Object Oriented Programming
import operator
from collections import Counter
class Heap:
    """Binary min-heap with a pluggable comparator.

    ``is_smaller(lhs, rhs)`` must return True when ``lhs`` should be
    extracted before ``rhs``; passing ``operator.gt`` turns this into a
    max-heap.
    """

    # you can also use a lambda function here
    # is_smaller=lambda lhs, rhs: lhs < rhs
    def __init__(self, iterable=None, *, is_smaller=operator.lt):
        self._buffer = []            # array-backed complete binary tree
        self._is_smaller = is_smaller
        if iterable is None:
            return
        for elem in iterable:
            self.add(elem)

    def _sift_up(self, idx):
        # move the entry at idx towards the root until its parent is not larger
        while idx > 0:
            parent = (idx - 1) // 2
            if not self._is_smaller(self._buffer[idx], self._buffer[parent]):
                break
            self._swap(idx, parent)
            idx = parent

    def _sift_down(self, idx):
        # move the entry at idx towards the leaves while a child is smaller
        while True:
            left_child = idx * 2 + 1
            if left_child >= len(self._buffer):
                break
            smaller_child = left_child
            right_child = left_child + 1
            # smaller_child is replaced iff the right child exists and is smaller
            if right_child < len(self._buffer):
                if self._is_smaller(self._buffer[right_child], self._buffer[left_child]):
                    smaller_child = right_child
            if self._is_smaller(self._buffer[smaller_child], self._buffer[idx]):
                self._swap(idx, smaller_child)
                # BUG FIX: follow the displaced entry downwards. Without this
                # the loop re-examined the same index, stopped after sinking the
                # entry one level, and left the heap property violated deeper
                # in the tree (wrong extract_min order for heaps of depth >= 3).
                idx = smaller_child
            else:
                break

    def _swap(self, i, j):
        self._buffer[i], self._buffer[j] = self._buffer[j], self._buffer[i]

    def add(self, value):
        """Insert value into the heap."""
        self._buffer.append(value)
        last_index = len(self._buffer) - 1
        self._sift_up(last_index)

    def extract_min(self):
        """Remove and return the smallest element (w.r.t. is_smaller).

        Raises IndexError if the heap is empty.
        """
        if not self._buffer:
            raise IndexError("Extract from empty heap")
        minimum = self._buffer[0]
        # swap the root with the last entry so pop() is O(1)
        self._swap(0, -1)
        self._buffer.pop()
        self._sift_down(0)
        return minimum

    def __len__(self):
        return len(self._buffer)

    def __bool__(self):
        return bool(self._buffer)

    # two heaps are equal if they contain the same elements the same number of times
    # see collections section
    def __eq__(self, other):
        return Counter(self._buffer) == Counter(other._buffer)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __str__(self):
        return repr(self)

    # cannot represent self._compare as string
    def __repr__(self):
        return f"<binary min-heap with user-defined comparator and entries {self._buffer}>"
import operator
def k_most_frequent(text, k):
    """Return the k most frequent words using the custom Heap as a max-heap."""
    # (frequency, word) pairs so the heap orders by frequency first
    freq_word_pairs = [(freq, word) for word, freq in count_words(text).items()]
    # inefficient: the heap stores all entries although only the top k matter
    max_heap = Heap(freq_word_pairs, is_smaller=operator.gt)
    return [max_heap.extract_min()[1] for _ in range(k)]
k_most_frequent("A rose is a rose is a rose", k=2)
['rose', 'a']
import heapq
def k_most_frequent(text, k):
    """Return the k most frequent words using heapq.nlargest (O(n log k))."""
    freq_word_pairs = [(freq, word) for word, freq in count_words(text).items()]
    # nlargest keeps only k entries internally - more efficient than a full heap
    return [word for _, word in heapq.nlargest(k, freq_word_pairs)]
k_most_frequent("A rose is a rose is a rose", k=2)
['rose', 'a']
Index-Backmapping ohne Duplikate Schreiben Sie eine Funktion, die eine Liste von Objekten nimmt und ein
dict
zurückgibt, das jedem Objekt seinen Index in der Liste zuordnet. Gehen Sie davon aus, dass kein Objekt doppelt vorkommt.find_index_nodups(["g", "t", "a"]) -> {'a': 2, 'g': 0, 't': 1}
# see SL1
def find_index_nodups(seq):
    """Map each element of seq to its index; assumes no duplicates."""
    mapping = {}
    # requires indexing of seq
    for position in range(len(seq)):
        mapping[seq[position]] = position
    return mapping
def find_index_nodups(seq):
    """Map each element of seq to its index; assumes no duplicates."""
    mapping = {}
    position = 0
    for element in seq:
        # does not require indexing of seq - a manual counter tracks positions
        mapping[element] = position
        position += 1
    return mapping
def find_index_nodups(seq):
    """Map each element of seq to its index; assumes no duplicates."""
    mapping = {}
    # enumerate eliminates the manual index counter
    for position, element in enumerate(seq):
        mapping[element] = position
    return mapping
def find_index_nodups(seq):
    """Map each element of seq to its index; assumes no duplicates."""
    return dict((element, position) for position, element in enumerate(seq))
find_index_nodups(["g", "t", "a"])
{'g': 0, 't': 1, 'a': 2}
Index-Backmapping mit Duplikaten Wir erlauben jetzt Duplikate. Passen Sie die Funktion so an, dass das zurückgegebene
dict
jedem Objekt eine Liste der Indizes aller Vorkommen zuordnet.find_index(["g", "a", "a"]) -> {'a': [1, 2], 'g': [0]}
# dict comprehensions do not work in this case, back to for-loops
# dict comprehensions do not work in this case, back to for-loops
def find_index(seq):
    """Map each element of seq to the list of all indices where it occurs."""
    mapping = {}
    for position, element in enumerate(seq):
        # look-before-you-leap: membership test before the lookup
        if element not in mapping:
            mapping[element] = []
        mapping[element].append(position)
    return mapping
def find_index(seq):
    """Map each element of seq to the list of all indices where it occurs."""
    mapping = {}
    for position, element in enumerate(seq):
        # easier-to-ask-forgiveness: one lookup on the common path
        try:
            bucket = mapping[element]
        except KeyError:
            bucket = mapping[element] = []
        bucket.append(position)
    return mapping
from collections import defaultdict
def find_index(seq):
    """Map each element of seq to the list of all indices where it occurs."""
    # defaultdict creates the empty list on first access
    occurrence_map = defaultdict(list)
    for position, element in enumerate(seq):
        occurrence_map[element].append(position)
    # hand back a plain dict so callers do not see defaultdict's auto-insert
    return dict(occurrence_map)
find_index(["g", "a", "a"])
{'g': [0], 'a': [1, 2]}
Lookup mit Fallback Schreiben Sie eine Funktion, die eine Liste von
dict
s, einen Key k und einen Default-Wert d nimmt. Die Funktion soll den Key k in dendict
s der Reihe nach nachschlagen, bis ein Treffer gefunden wurde. Wenn keines derdict
s den Key k enthält, soll stattdessen d zurückgegeben werden.config1 = { "width": 300, "height": 500, "color": "red" } config2 = { "color": "blue", "fullscreen": True } fallback_lookup([config2, config1], "fullscreen", False) -> True fallback_lookup([config2, config1], "width", 900) -> 300 fallback_lookup([config2, config1], "color", "green") -> "blue" fallback_lookup([config2, config1], "minimized", False) -> False
def fallback_lookup(mappings, key, default):
    """Look key up in each mapping in order; return default if nowhere found."""
    for mapping in mappings:
        try:
            return mapping[key]
        except KeyError:
            # not in this mapping - fall through to the next one
            continue
    return default
from collections import ChainMap
def fallback_lookup(mappings, key, default):
    """Look key up in each mapping in order; return default if nowhere found."""
    # ChainMap searches the mappings left to right on every lookup
    combined = ChainMap(*mappings)
    return combined.get(key, default)
config1 = {
"width": 300,
"height": 500,
"color": "red"
}
config2 = {
"color": "blue",
"fullscreen": True
}
print(fallback_lookup([config2, config1], "fullscreen", False))
print(fallback_lookup([config2, config1], "width", 900))
print(fallback_lookup([config2, config1], "color", "green"))
print(fallback_lookup([config2, config1], "minimized", False))
True
300
blue
False
We can now use Counter
to implement heap equality. Two heaps are equal if they contain the same elements the same number of times.
We will now experimentally verify deque
runtimes (see https://wiki.python.org/moin/TimeComplexity)
%%timeit
test = []
for _ in range(100_000):
test.append(42)
5.95 ms ± 112 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
test = []
for _ in range(100_000):
test.insert(0, 42)
1.23 s ± 8.23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%timeit
test = list(range(100_000))
for _ in range(100_000):
test.pop()
9.07 ms ± 604 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
test = list(range(100_000))
for _ in range(100_000):
del test[0]
758 ms ± 14.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
from collections import deque
%%timeit
test = deque()
for _ in range(100_000):
test.append(42)
6.16 ms ± 107 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
test = deque()
for _ in range(100_000):
test.appendleft(42)
6.05 ms ± 114 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
test = deque(range(100_000))
for _ in range(100_000):
test.pop()
9.12 ms ± 121 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
test = deque(range(100_000))
for _ in range(100_000):
test.popleft()
9.38 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Assignment SL2¶
Zufälliges Wort ziehen Gegeben sei ein
dict
mit Wörtern und ihren Häufigkeiten wie in den vorigen Aufgaben. Schreiben Sie eine Funktion, die ein zufälliges Wort aus den Wörtern zieht. Groß-/Kleinschreibung ist unwichtig. Doppelte Wörter sollen nicht höher gewichtet werden.
import random
def random_word(words_with_frequencies):
    """Draw a word uniformly at random, ignoring the stored frequencies."""
    # iterating a dict yields its keys
    return random.choice(list(words_with_frequencies))
random_word(count_words("a a a a a a a a a a a a a a a b c"))
'a'
Zufälliges Wort ziehen mit Gewichtung Schreiben Sie die Funktion aus Teil 1 so um, dass Wörter, die im Text häufiger vorkommen, mit einer höheren Wahrscheinlichkeit gezogen werden. Zum Beispiel soll ein Wort, das im Text dreimal vorkommt, dreimal so wahrscheinlich gezogen werden wie ein Wort, das nur einmal vorkommt.
import random
def random_word_weighted(words_with_frequencies):
    """Draw a word with probability proportional to its frequency."""
    words = list(words_with_frequencies)
    weights = list(words_with_frequencies.values())
    # random.choices returns a list of length 1 here
    (word,) = random.choices(words, weights)
    return word
random_word_weighted(count_words("a a a a a a a a a a a a a a a b c"))
'c'
sys
& io
¶
https://docs.python.org/3/library/sys.html
https://docs.python.org/3/library/io.html
from collections import namedtuple
Point2D = namedtuple('Point2D', ['x', 'y'])
Popular formats for storing data as plain text
Number of records in first line
3
2 6
2.3 7.3
0.1 2.2
Records end with EOF instead
2 6
2.3 7.3
0.1 2.2
def read_points_from_stdin_with_count():
    """Read a point count from the first stdin line, then that many 'x y' lines."""
    number_of_points = int(input())
    points = []
    for _ in range(number_of_points):
        # implicit input validation: a malformed line raises an exception,
        # but its cause is difficult to trace - always validate explicitly
        # in real code
        raw_x, raw_y = input().split()
        points.append(Point2D(float(raw_x), float(raw_y)))
    return points
The sys
module provides means of accessing the standard streams as well as command line arguments via argv
. You can also use sys.exit(code)
to exit the current program with a custom status code.
import sys
def read_points_from_stdin_without_count():
    """Read one 'x y' point per stdin line until EOF."""
    points = []
    # sys.stdin is file-like, so it can be iterated line by line
    for raw_line in sys.stdin:
        x_token, y_token = raw_line.strip().split()
        points.append(Point2D(float(x_token), float(y_token)))
    return points
def read_points_from_file_without_count(filename):
    """Open filename and parse one 'x y' point per line until EOF."""
    points = []
    # more details on "with" later!
    with open(filename, 'r') as file:
        for raw_line in file:
            x_token, y_token = raw_line.strip().split()
            points.append(Point2D(float(x_token), float(y_token)))
    return points
These functions look very similar - this is not by accident! sys.stdin
is file-like i.e. it behaves like a file.
# "file" is a file-like object
# "file" is a file-like object
def read_points_from_file_like(file):
    """Parse one 'x y' point per line from any file-like object."""
    def _parse(line):
        # exactly two whitespace-separated tokens are expected per line
        x_token, y_token = line.strip().split()
        return Point2D(float(x_token), float(y_token))
    return [_parse(line) for line in file]
Take a look inside io
- streams are organized in a hierarchy of classes. Text streams are just variations of binary streams where binary data is interpreted according to a specified encoding.
import io
points = """\
4 5
2 7
1 -5
2 1
"""
# StringIO is also file-like
point_objects = read_points_from_file_like(io.StringIO(points))
point_objects
[Point2D(x=4.0, y=5.0),
Point2D(x=2.0, y=7.0),
Point2D(x=1.0, y=-5.0),
Point2D(x=2.0, y=1.0)]
Assignment SL3 (with 🧑🏫-Bonus)¶
Datei schreiben Schreiben Sie eine Funktion, die eine Liste von 2D-Punkten im besprochenen
namedtuple
-Format sowie ein file-like object nimmt und die Punkte im selben textbasierten Format speichert, das wir im Beispiel eingelesen haben.
Bonus🧑🏫: Take a look at zipfile and try to write a compressed file.
Bonus🧑🏫: Use pathlib instead of open
.
def write_points(file, points):
    """Write one 'x y' line per point to a file-like object."""
    file.writelines(f"{point.x} {point.y}\n" for point in points)
with open("points.dump", "w") as file:
write_points(file, point_objects)
from zipfile import ZipFile
points_string = "\n".join([f"{p.x} {p.y}" for p in point_objects])
with ZipFile("points.zip", "w") as zipfile:
zipfile.writestr("/points.dump", points_string)
from pathlib import Path
with (Path(".")/"points.dump").open("w") as file:
write_points(file, point_objects)
pickle
¶
https://docs.python.org/3/library/pickle.html
pickle
is Python’s binary serializer. It is straightforward to use, but files produced by pickle
are generally not human-readable.
import pickle
points = [
Point2D(x=4, y=5),
Point2D(x=2, y=7),
Point2D(x=1, y=-5),
Point2D(x=2, y=1)
]
# binary format instead of text-based format
pickle.dumps(points)
b'\x80\x04\x95E\x00\x00\x00\x00\x00\x00\x00]\x94(\x8c\x08__main__\x94\x8c\x07Point2D\x94\x93\x94K\x04K\x05\x86\x94\x81\x94h\x03K\x02K\x07\x86\x94\x81\x94h\x03K\x01J\xfb\xff\xff\xff\x86\x94\x81\x94h\x03K\x02K\x01\x86\x94\x81\x94e.'
with open("points.pickle", 'wb') as file:
# dump takes a file-like object
pickle.dump(points, file)
with open("points.pickle", 'rb') as file:
loaded_points = pickle.load(file)
print(loaded_points)
[Point2D(x=4, y=5), Point2D(x=2, y=7), Point2D(x=1, y=-5), Point2D(x=2, y=1)]
Bonus Assignment SL4 🧑🏫¶
dict
im Sekundärspeicher🧑🏫 Schreiben Sie eine KlasseSerializeDict
, die sich nach außen wie eindict
verhält, aber alle Werte mithilfe vonpickle
im Sekundärspeicher (Datei auf Festplatte/SSD) ablegt.
MinHeap Without Duplicates¶
This version of the heap disallows duplicates, instead adjusting priority if the object is already contained in the heap. This is not possible when using heapq
. Disallowing duplicates allows deleting objects from the heap, but forces the programmer to store the key separately. Using a class helps keeping the state of _buffer
and _index
consistent.
from collections import namedtuple
_Entry = namedtuple('_Entry', ['priority', 'value'])


# do not inherit from MinHeap -> liskov substitution principle
class MinHeapWithSeparatePriority:
    """Binary min-heap with explicit priorities and no duplicate values.

    Adding a value that is already contained adjusts its priority instead
    of inserting a second copy. ``_index`` maps each value to its current
    position in ``_buffer`` so priority updates and removals run in
    O(log n).
    """

    def __init__(self, iterable=None):
        self._buffer = []   # complete binary tree of _Entry(priority, value)
        self._index = {}    # value -> index into _buffer
        if iterable is None:
            return
        for value, priority in iterable:
            self.add(priority, value)

    def add(self, priority, value):
        """Insert value with priority, or adjust the priority if present."""
        try:
            existing_index = self._index[value]
        except KeyError:
            self._buffer.append(_Entry(priority, value))
            last_index = len(self._buffer) - 1
            self._index[value] = last_index
            self._sift_up(last_index)
        else:
            old_prio = self._buffer[existing_index].priority
            self._buffer[existing_index] = _Entry(priority, value)
            # restore the heap property in the direction the priority moved
            if old_prio < priority:
                self._sift_down(existing_index)
            else:
                self._sift_up(existing_index)

    def extract_min(self):
        """Remove and return the value with the smallest priority.

        Raises IndexError if the heap is empty.
        """
        if not self._buffer:
            raise IndexError("Extract from empty heap")
        minimum = self._buffer[0].value
        self._remove(0)
        return minimum

    def remove(self, value):
        """Remove value if it is contained; otherwise do nothing."""
        try:
            existing_index = self._index[value]
        except KeyError:
            pass
        else:
            self._remove(existing_index)

    def _remove(self, index):
        # move the doomed entry to the end, drop it, then repair the heap
        value_at_index = self._buffer[index].value
        self._swap(index, -1)
        self._buffer.pop()
        del self._index[value_at_index]
        if index < len(self._buffer):
            # BUG FIX: the entry that moved into `index` can violate the heap
            # property in either direction, so repair both ways (one of the
            # two calls is always a no-op)
            self._sift_down(index)
            self._sift_up(index)

    def _sift_up(self, idx):
        # move the entry at idx towards the root while smaller than its parent
        while idx > 0:
            parent = (idx - 1) // 2
            if not (self._buffer[idx].priority < self._buffer[parent].priority):
                break
            self._swap(idx, parent)
            idx = parent

    def _sift_down(self, idx):
        # move the entry at idx towards the leaves while a child is smaller
        while True:
            left_child = idx * 2 + 1
            if left_child >= len(self._buffer):
                break
            smaller_child = left_child
            right_child = left_child + 1
            if right_child < len(self._buffer):
                if self._buffer[right_child].priority < self._buffer[left_child].priority:
                    smaller_child = right_child
            if self._buffer[smaller_child].priority < self._buffer[idx].priority:
                self._swap(idx, smaller_child)
                # BUG FIX: follow the displaced entry downwards; this was
                # missing, so entries only ever sank one level
                idx = smaller_child
            else:
                break

    def _swap(self, i, j):
        self._buffer[i], self._buffer[j] = self._buffer[j], self._buffer[i]
        # BUG FIX: _index is keyed by *value*, not by the whole _Entry tuple.
        # The original used self._buffer[i] as the key, which polluted the map
        # with _Entry keys and left the real value entries stale after every
        # swap (breaking priority updates and removals).
        self._index[self._buffer[i].value] = i
        self._index[self._buffer[j].value] = j

    # see iterators chapter
    def __iter__(self):
        # yield the stored values in internal (heap) order
        for entry in self._buffer:
            yield entry.value
# len, bool, str, etc.
heap = MinHeapWithSeparatePriority()
heap.add(3, "b")
heap.add(5, "c")
heap.add(1, "a")
heap.add(0, "c")
print(heap.extract_min())
print(heap.extract_min())
print(heap.extract_min())
c
a
b
Assignment SL5¶
Veränderbarer 2D-Punkt Später benötigen wir eine Implementierung von
Point2D
, deren Koordinatenx
undy
nachträglich verändert werden können. Dazu können wir leider nichtnamedtuple
verwenden, sondern müssen eine eigene Klasse schreiben. Implementieren Sie dazu den Konstruktor sowie __str__
,__repr__
,__eq__
und__ne__
. An diesem Punkt sei auf das Moduldataclasses
verwiesen, das allerdings in diesem Kurs nicht thematisiert wird, die Verwendung ist also freigestellt.Distanz zu anderen Punkten Erweitern Sie Ihre Klasse
Point2D
um eine Funktiondist(p)
, die die euklidische Distanz zu einem anderen Punkt p bestimmt.
import math
class Point2DMutable:
    """A mutable 2D point with x/y coordinates and Euclidean distance."""

    __slots__ = '_x', '_y'

    # no defaults!
    def __init__(self, x, y):
        # no redundant state
        self._x = x
        self._y = y

    # x/y properties give the same read access as the namedtuple Point2D
    # (p.x, p.y) while also allowing mutation, as the assignment requires
    @property
    def x(self):
        return self._x

    @x.setter
    def x(self, value):
        self._x = value

    @property
    def y(self):
        return self._y

    @y.setter
    def y(self, value):
        self._y = value

    def __str__(self):
        return f"{self._x}, {self._y}"

    def __repr__(self):
        return f"Point2DMutable({self._x}, {self._y})"

    def __eq__(self, other):
        # defer to the other operand for foreign types instead of crashing
        # with AttributeError
        if not isinstance(other, Point2DMutable):
            return NotImplemented
        return (self._x, self._y) == (other._x, other._y)

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def dist(self, p):
        """Return the Euclidean distance to another point p.

        BUG FIX: the original passed [p._y, p._y] to math.dist, i.e. it used
        the y coordinate twice instead of (p._x, p._y).
        """
        return math.dist((self._x, self._y), (p._x, p._y))
Assignment SL6 🧑🏫¶
Kompliziertes Dateiformat Schreiben Sie eine Funktion, die das unten beschriebene Dateiformat aus einem übergebenen file-like object ausliest, alle Bedingungen an die Werte überprüft und dann als Liste von
namedtuple
s zurückgibt. Im Fehlerfall soll eine geeignete Exception geworfen werden.Daten verarbeiten Lesen Sie das untenstehende Beispiel ein und berechnen Sie die Summe der Wahrscheinlichkeiten aller Datenzeilen.
Das Dateiformat besteht aus einer beliebigen Anzahl Zeilen. Eine solche Zeile ist entweder leer, ein Kommentar, oder eine Datenzeile.
Leere Zeilen dienen der Lesbarkeit und sollen einfach ignoriert werden.
Kommentarzeilen beginnen mit einem #
und dienen der Dokumentation, diese sollen beim Einlesen ebenfalls ignoriert werden.
Datenzeilen enthalten drei durch Kommas getrennte Werte: Der erste Wert ist eine ganzzahlige, nichtnegative, innerhalb einer Datei eindeutige Nummerierung der Datenzeile. Der zweite Wert ist eine Wahrscheinlichkeitsangabe, also eine Fließkommazahl zwischen 0 und 1. Der dritte Wert soll Ja/Nein-Eigenschaften darstellen. Diese sind in einem String aus exakt 5 Zeichen kodiert. Erlaubte Zeichen sind “Y” und “N”.
Direkt vor und nach Kommas sowie am Zeilenanfang und -ende dürfen Leerzeichen stehen, damit die Spalten aneinander ausgerichtet werden können.
# id, probability, properties
0, 0.25, YYYNN
1, 0.1, NNNNY
# very probable
4, 0.6, YYYYY
from collections import namedtuple
Row = namedtuple("Row", ["id", "probability", "properties"])
class ParseError(Exception):
pass
def _parse_id(s):
try:
i = int(s)
except ValueError as e:
raise ParseError("ID must be numeric") from e
if i < 0:
raise ParseError("ID must be positive")
return i
def _parse_probability(s):
try:
f = float(s)
except ValueError as e:
raise ParseError("probability must be a valid number") from e
if f > 1 or f < 0:
raise ParseError("probability must be between 0 and 1")
return f
def _parse_properties(s):
if len(s) != 5:
raise ParseError("properties must be exactly 5 characters long") from e
for c in s:
if c not in "YN":
raise ParseError("properties can only consist of 'Y' and 'N'") from e
return s
def _parse_line(line):
    """Split a data line at commas and validate its three fields.

    Returns (id, probability, properties); raises ParseError on bad input.
    """
    fields = line.split(",")
    if len(fields) < 3:
        raise ParseError("Not enough values")
    if len(fields) > 3:
        raise ParseError("Too many values")
    # whitespace around commas / at line edges is allowed for alignment
    raw_id, raw_probability, raw_properties = (field.strip() for field in fields)
    return (
        _parse_id(raw_id),
        _parse_probability(raw_probability),
        _parse_properties(raw_properties),
    )
def parse_file(file):
    """Parse the whole file-like object into a list of Row namedtuples.

    Blank lines and '#' comment lines are skipped; duplicate IDs raise
    ParseError.
    """
    seen_ids = set()
    rows = []
    for raw_line in file:
        stripped = raw_line.strip()
        # ignore empty lines and comment lines
        if not stripped or stripped[0] == '#':
            continue
        row_id, probability, properties = _parse_line(stripped)
        if row_id in seen_ids:
            raise ParseError(f"ID not unique: {row_id}")
        seen_ids.add(row_id)
        rows.append(Row(row_id, probability, properties))
    return rows
rows = parse_file(io.StringIO("""\
# id, probability, properties
0, 0.25, YYYNN
1, 0.1, NNNNY
# very probable
4, 0.6, YYYYY
"""))
rows
[Row(id=0, probability=0.25, properties='YYYNN'),
Row(id=1, probability=0.1, properties='NNNNY'),
Row(id=4, probability=0.6, properties='YYYYY')]