import requests
import re
import sys
from timeit import timeit
# Raw HTML of the "Netzwerk" (networking) sub-forum listing, fetched once at
# import time. The explicit timeout keeps the script from blocking forever if
# the server stops responding (requests applies no timeout by default).
dfde = requests.get(
    "https://debianforum.de/forum/viewforum.php?f=30",
    timeout=30,
).text
# One match per thread row of the forum listing. The ``(?:.*?)`` pieces are
# non-capturing, non-greedy fillers that skip as little markup as necessary
# between the anchored fields; ``re.S`` lets them span line breaks.
# NOTE(review): the pattern in the reviewed source was damaged (group names
# and parts of the anchors were missing). The group names follow from the
# format string in parse(); the leading ``topictitle.>`` anchor and the
# optional closing tag / ``&raquo;`` alternative after the user name are
# reconstructions -- verify against the live forum markup.
_THREAD_RE = re.compile(
    r"(?:topictitle.>)(?P<topictitle>[^<]+)(?:.*?)"
    r"(?:username.>)(?P<username>[^<]+)(?:(?:</\w+>)? (?:»|&raquo;) )"
    r"(?:.*?)"
    r"(?P<date>[0-9][0-9.,: ]{18})"
    r"(?:.*?)"
    r"(?:posts.>)(?P<posts>[0-9]+)",
    re.S,
)


def parse(html):
    """Yield one ``title;date;username;posts`` line per thread found in *html*.

    Args:
        html: Markup of a forum listing page as a string.

    Yields:
        A semicolon-separated string built from the ``topictitle``, ``date``,
        ``username`` and ``posts`` groups of each regex match, in that order.
        Nothing is yielded when no thread row matches (e.g. empty input).
    """
    # Scan the argument rather than the module-level download, so the
    # function works on whatever markup the caller passes in.
    for match in _THREAD_RE.finditer(html):
        yield "{topictitle};{date};{username};{posts}".format(**match.groupdict())
if __name__ == "__main__":
    if "timeit" in sys.argv:
        # parse() is a generator function: calling it only builds a generator
        # object. The result has to be consumed (list(...)) for the regex scan
        # to run at all, otherwise the measurement covers generator creation
        # only.
        # timeit defaults to 1,000,000 runs, which is impractical once real
        # work is being timed; 100 is an arbitrary, adjustable sample size.
        timing_results = timeit(lambda: list(parse(dfde)), number=100)
        # Total seconds for all runs, as returned by timeit.
        print(timing_results)
    else:
        # Default mode: print one "title;date;username;posts" line per thread.
        for thread in parse(dfde):
            print(thread)