How To Match And Remove Wikipedia References With Python And Re
from bs4 import BeautifulSoup import requests import time import keyboard import re def searchWiki(): search = input('What do you want to search for? ').replace(' ', '_').repl
Solution 1:
You can do it using regular expressions.
For example, with your `p` variable:
import re

# Remove numeric citation markers such as "[12]" from the paragraph text.
# The pattern is a raw string so that `\[` and `\]` reach the regex engine
# unchanged instead of being treated as (invalid) string escapes.
line = p.text.strip()
new_line = re.sub(r"\[[0-9]+\]", '', line)
print(new_line)
Solution 2:
All of the footnotes are under the class `reference`; you can remove them using the `decompose()` method:
# Delete every element carrying the `reference` class from the parsed tree.
for tag in soup.find_all(class_="reference"):
    tag.decompose()
import requests
from bs4 import BeautifulSoup
def searchWiki():
    """Prompt for a topic and print the matching English Wikipedia article.

    The user's input is turned into an article URL (spaces become
    underscores, apostrophes become ``%27``), the page is fetched and parsed,
    and every element with the ``reference`` class (the footnote markers) is
    removed before the page title, the URL, and the text of each ``<p>``
    element are printed.
    """
    search = input("What do you want to search for? ").replace(" ", "_").replace("'", "%27")
    url = f"https://en.wikipedia.org/wiki/{search}"
    # Send a browser-like User-Agent header with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")

    # Remove all footnotes under the `reference` class so they do not show
    # up in the paragraph text printed below.
    for tag in soup.find_all(class_="reference"):
        tag.decompose()

    title = soup.find("title").get_text()
    info = soup.select("p")

    print("Press enter to read the next paragraph")
    print(title)
    print(url)
    for p in info:
        print(p.text.strip())
# Run the interactive search: prompts for a topic and prints the article text.
searchWiki()
Post a Comment for "How To Match And Remove Wikipedia References With Python And Re"