Skip to content Skip to sidebar Skip to footer

How To Match And Remove Wikipedia References With Python And Re

from bs4 import BeautifulSoup import requests import time import keyboard import re def searchWiki(): search = input('What do you want to search for? ').replace(' ', '_').repl

Solution 1:

You can do it using regular expressions.

For example, with your `p` variable (a paragraph tag from the parsed page):

import re

# Strip numeric citation markers such as "[1]" or "[23]" from the paragraph text.
# The pattern is a raw string so that `\[` and `\]` reach the regex engine as
# escaped brackets instead of being treated as (invalid) string escapes.
line = p.text.strip()
new_line = re.sub(r"\[[0-9]+\]", '', line)
print(new_line)

Solution 2:

All of the footnotes are in elements with the class `reference`, so you can remove them using the `decompose()` method:

# Delete every element carrying the `reference` class from the parsed tree.
for tag in soup.find_all(class_="reference"):
    tag.decompose()

import requests
from bs4 import BeautifulSoup


def searchWiki():
    """Prompt for a search term and print the matching Wikipedia article.

    The term is read from standard input; spaces are replaced with
    underscores and apostrophes with ``%27`` to build the article URL.
    The page is fetched with a browser-like User-Agent header and parsed
    with ``html.parser``. Every element with the ``reference`` class is
    removed from the tree, then the page title, the URL, and the stripped
    text of each ``<p>`` element are printed.
    """
    search = input("What do you want to search for? ").replace(" ", "_").replace("'", "%27")
    url = f"https://en.wikipedia.org/wiki/{search}"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    # Remove all footnotes under the `reference` class
    for tag in soup.find_all(class_="reference"):
        tag.decompose()

    title = soup.find("title").get_text()
    info = soup.select("p")
    # NOTE: this message is printed, but nothing below waits for a key press;
    # all paragraphs are printed in one pass.
    print("Press enter to read the next paragraph")
    print(title)
    print(url)
    for p in info:
        print(p.text.strip())


# Start the interactive search. This call sits at module level, so it runs
# whenever the file is executed or imported.
searchWiki()

Post a Comment for "How To Match And Remove Wikipedia References With Python And Re"