AnandRao committed on
Commit
3d19fad
·
1 Parent(s): ba1af9b

Upload WebScraping-BeautifulSoup.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. WebScraping-BeautifulSoup.py +82 -0
WebScraping-BeautifulSoup.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ import requests
3
+ import re
4
+ from docx import Document # Import the Document class from the python-docx library
5
+ from docx.oxml import OxmlElement # Import OxmlElement for creating XML elements
6
+ from docx.oxml.ns import qn # Import qn for namespacing
7
+
8
# Build a clickable external hyperlink inside a python-docx paragraph
def add_hyperlink(paragraph, url, text):
    """
    Append an external hyperlink run to the end of a paragraph.

    The ``w:hyperlink`` element is assembled directly from low-level XML
    elements and attached to the paragraph's underlying ``_p`` element.

    :param paragraph: The Paragraph to which the hyperlink will be added.
    :param url: The URL the hyperlink points to.
    :param text: The text displayed for the hyperlink.
    :return: The ``w:hyperlink`` element that was appended.
    """
    # Register the target URL as an external relationship on the owning part;
    # the returned relationship id is what the hyperlink element refers to.
    hyperlink_reltype = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
    rel_id = paragraph.part.relate_to(url, hyperlink_reltype, is_external=True)

    link_el = OxmlElement('w:hyperlink')
    link_el.set(qn('r:id'), rel_id)

    # Run properties: blue text ('0000EE') with a single underline.
    run_props = OxmlElement('w:rPr')
    for tag, value in (('w:color', '0000EE'), ('w:u', 'single')):
        prop = OxmlElement(tag)
        prop.set(qn('w:val'), value)
        run_props.append(prop)

    # The run carries the formatting and the visible text.
    run_el = OxmlElement('w:r')
    run_el.append(run_props)
    run_el.text = text
    link_el.append(run_el)

    paragraph._p.append(link_el)

    return link_el
49
+
50
# Download the page at the given URL and return its body as text
def getHTMLdocument(url, timeout=10):
    """
    Fetch ``url`` with an HTTP GET and return the response body as text.

    :param url: Address of the page to download.
    :param timeout: Seconds to wait for the server before giving up
        (passed straight to ``requests.get``). Defaults to 10.
    :return: The response body decoded to ``str`` (``response.text``).
    :raises requests.exceptions.RequestException: On connection failure,
        timeout, or a 4xx/5xx status code.
    """
    # Without an explicit timeout the request can block forever on an
    # unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    # The body is returned as decoded text (HTML for a web page), not JSON.
    return response.text
56
+
57
# Page whose links are collected
url_to_scrape = "https://www.coursera.org/"

# Download the page and parse it into a soup object
html_document = getHTMLdocument(url_to_scrape)
soup = BeautifulSoup(html_document, 'html.parser')

# Word document that receives one paragraph per link
doc = Document()

# All anchor tags whose "href" attribute starts with "https://"
links = soup.find_all('a', href=re.compile("^https://"))

for link in links:
    # Every matched tag has an href, so item access is safe here
    url = link['href']
    # Each link gets its own paragraph; the URL doubles as the visible text
    p = doc.add_paragraph()
    add_hyperlink(p, url, url)

# Write the collected links to disk
doc.save('scraped_links.docx')
82
+