AnandRao
/

WebScraping-BeautifulSoup.py

Model card Files Files and versions

xet

Community

AnandRao committed on Dec 28, 2023

Commit

3d19fad

1 Parent(s): ba1af9b

Upload WebScraping-BeautifulSoup.py with huggingface_hub

Browse files

Files changed (1) hide show

WebScraping-BeautifulSoup.py +82 -0

WebScraping-BeautifulSoup.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from bs4 import BeautifulSoup
+import requests
+import re
+from docx import Document  # Import the Document class from the python-docx library
+from docx.oxml import OxmlElement  # Import OxmlElement for creating XML elements
+from docx.oxml.ns import qn  # Import qn for namespacing
+# Function to add a hyperlink to a paragraph
+def add_hyperlink(paragraph, url, text):
+    """
+    A function that places a hyperlink within a paragraph object.
+    :param paragraph: The Paragraph to which the hyperlink will be added.
+    :param url: The URL of the hyperlink.
+    :param text: The text displayed for the hyperlink.
+    :return: The hyperlink object
+    """
+    # This gets access to the document.xml.rels file and gets a new relation id value
+    part = paragraph.part
+    r_id = part.relate_to(url, "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink", is_external=True)
+    # Create the w:hyperlink tag and add needed values
+    hyperlink = OxmlElement('w:hyperlink')
+    hyperlink.set(qn('r:id'), r_id, )
+    # Create a w:r element
+    new_run = OxmlElement('w:r')
+    # Create a new w:rPr element
+    rPr = OxmlElement('w:rPr')
+    # Add color if you like
+    c = OxmlElement('w:color')
+    c.set(qn('w:val'), '0000EE')  # set the color to blue
+    rPr.append(c)
+    # Remove underlining if you like
+    u = OxmlElement('w:u')
+    u.set(qn('w:val'), 'single')
+    rPr.append(u)
+    new_run.append(rPr)
+    new_run.text = text
+    hyperlink.append(new_run)
+    paragraph._p.append(hyperlink)
+    return hyperlink
+# function to extract html document from given url
+def getHTMLdocument(url):
+    # request for HTML document of given url
+    response = requests.get(url)
+    # response will be provided in JSON format
+    return response.text
+# assign URL
+url_to_scrape = "https://www.coursera.org/"
+# create document
+html_document = getHTMLdocument(url_to_scrape)
+# create soap object
+soup = BeautifulSoup(html_document, 'html.parser')
+# Create a new Document object
+doc = Document()
+# find all the anchor tags with "href"
+# attribute starting with "https://"
+links = soup.find_all('a', attrs={'href': re.compile("^https://")})
+for link in links:
+    # Extract the href (url) from the anchor tag
+    url = link.get('href')
+    # Use the add_hyperlink function to add the URL as a clickable link to the document
+    p = doc.add_paragraph()
+    add_hyperlink(p, url, url)
+# Save the document with a given name
+doc.save('scraped_links.docx')