threads_regex
von TRex- SNIPPET_TEXT:
-
- import requests
- import re
- import sys
- from timeit import timeit
# HTML of the "Netzwerk" (networking) subforum, fetched once when the module
# is loaded. The timeout keeps the script from hanging forever on an
# unresponsive server, and raise_for_status() prevents an HTTP error page
# from being handed to the parser as if it were the forum listing.
_response = requests.get(
    "https://debianforum.de/forum/viewforum.php?f=30",
    timeout=30,
)
_response.raise_for_status()
dfde = _response.text
# Pattern for one topic row of the forum listing. Compiled once at module
# level so repeated parse() calls do not rebuild it.
#
# - Each (?:.*?) is a non-capturing, non-greedy group: it skips as little
#   markup as necessary to reach the next anchor.
# - A bare "." inside the attribute anchors (e.g. class=.row.bg) matches any
#   single character; presumably it stands in for the quote or space found
#   in the real markup.
# - The date group is a digit followed by 18 characters drawn from digits,
#   ".", ",", ":" and space, i.e. a 19-character timestamp.
# - re.S lets "." match newlines, so one row may span several lines.
_THREAD_RE = re.compile(
    r"<li class=.row.bg"
    r"(?:.*?)"
    r"(?:class=.topictitle.>)(?P<topictitle>[^<]+)(?:.*?)"
    r"(?:username.>)(?P<username>[^<]+)(?:</a> » )"
    r"(?:.*?)"
    r"(?P<date>[0-9][0-9.,: ]{18})"
    r"(?:.*?)"
    r"(?:posts.>)(?P<posts>[0-9]+)",
    re.S,
)


def parse(html):
    """Yield one summary line per topic row found in *html*.

    Args:
        html: Markup of a forum listing page, as a string.

    Yields:
        Strings of the form ``"<topictitle>;<date>;<username>;<posts>"``,
        in the order the rows appear in *html*. Nothing is yielded when no
        row matches (for example, for an empty string).
    """
    # Scan the argument that was passed in, not the module-level download,
    # so the function works on any document handed to it.
    for thread in _THREAD_RE.finditer(html):
        yield "{topictitle};{date};{username};{posts}".format(**thread.groupdict())
if __name__ == "__main__":
    # With "timeit" on the command line, benchmark the parser; otherwise
    # print one line per parsed thread.
    if "timeit" in sys.argv:
        # parse() is a generator function: calling it only creates a
        # generator object and runs no regex work. list() forces the whole
        # page to be parsed on every repetition, so the measurement is
        # meaningful. An explicit repetition count replaces timeit's default
        # of 1,000,000 runs, which is impractical for a full-page parse.
        timing_results = timeit(lambda: list(parse(dfde)), number=100)
        # Total seconds for all 100 repetitions.
        print(timing_results)
    else:
        for thread in parse(dfde):
            print(thread)
Quellcode
Hier kannst du den Code kopieren und ihn in deinen bevorzugten Editor einfügen. PASTEBIN_DOWNLOAD_SNIPPET_EXPLAIN