"""List the threads of the debianforum.de "Netzwerk" subforum.

Run as a script, this prints one ``topictitle;date;username;posts`` line
per thread found in the downloaded forum page.  When ``timeit`` is given
on the command line, it prints how long repeated parsing of that page
takes instead.
"""
import re
import sys
from timeit import timeit

import requests

# Number of full parse runs measured in "timeit" mode.  Set explicitly
# because timeit's default of 1,000,000 runs is far too many for a regex
# scan over a whole HTML page.
_TIMEIT_RUNS = 100

# HTML of the "Netzwerk" (networking) subforum, fetched once at import time.
dfde = requests.get("https://debianforum.de/forum/viewforum.php?f=30").text

# One match per thread entry.  (?:.*?) are non-capturing groups matching as
# few characters as necessary (non-greedy); re.S lets "." span newlines,
# since the pieces of one entry are spread over several lines of markup.
# The date group is a digit followed by 18 characters drawn from digits,
# ".", ",", ":" and space.
# NOTE(review): the group names and the leading "<li ... topictitle.>" anchor
# were reconstructed (the names from the format string in parse()); confirm
# the anchor against the live page markup.
_THREAD_RE = re.compile(
    r"<li(?:.*?)(?:topictitle.>)(?P<topictitle>[^<]+)(?:.*?)"
    r"(?:username.>)(?P<username>[^<]+)(?: » )"
    r"(?:.*?)"
    r"(?P<date>[0-9][0-9.,: ]{18})"
    r"(?:.*?)"
    r"(?:posts.>)(?P<posts>[0-9]+)",
    re.S,
)


def parse(html):
    """Yield one ``topictitle;date;username;posts`` string per thread.

    Args:
        html: Markup of a forum thread-list page to scan.

    Yields:
        A semicolon-separated record for every match of the thread
        pattern, in document order.  Nothing is yielded when the pattern
        does not match.
    """
    # Scan the argument, not the module-level page, so callers can parse
    # any markup they pass in.
    for thread in _THREAD_RE.finditer(html):
        yield "{topictitle};{date};{username};{posts}".format(**thread.groupdict())


if __name__ == "__main__":
    if "timeit" in sys.argv:
        # parse() is a generator function: calling it only creates the
        # generator.  list() drains it so the regex scan is actually timed.
        timing_results = timeit(lambda: list(parse(dfde)), number=_TIMEIT_RUNS)
        print(timing_results)
    else:
        for thread in parse(dfde):
            print(thread)