NoPaste

threads_bs

von TRex

SNIPPET_TEXT:
  1. import bs4
  2. import requests
  3. import re
  4. import sys
  5. from timeit import timeit
  6.  
  7. # HTML des "Netzwerk"-Unterforums
  8. dfde = requests.get("https://debianforum.de/forum/viewforum.php?f=30").text
  9.  
  10. def parse(html):
  11.     soup = bs4.BeautifulSoup(dfde, 'html.parser')
  12.     threads = soup.select("ul.topiclist li.row")
  13.     for thread in threads:
  14.         title = thread.select("a.topictitle")[0].text
  15.         date = thread.select("div.topic-poster")[0].text
  16.         date = re.search("[0-9][0-9.: ]{18}", date).group(0)
  17.         username = thread.select("div.topic-poster a")[0].text
  18.         answers = thread.find("dd", class_="posts").text
  19.  
  20.         yield ";".join([title, date, username, answers.split(" ")[0]])
  21.  
  22. if __name__ == "__main__":
  23.  
  24.     if "timeit" in sys.argv:
  25.         timing_results = timeit(lambda: parse(dfde))
  26.         print(timing_results)
  27.     else:
  28.         for thread in parse(dfde):
  29.             print(thread)

Quellcode

Hier kannst du den Code kopieren und ihn in deinen bevorzugten Editor einfügen.