SoDa12321 committed on
Commit
a8998e7
·
verified ·
1 Parent(s): f15991c

Update Function_Sumrerize_URL_Read.py

Browse files
Files changed (1) hide show
  1. Function_Sumrerize_URL_Read.py +33 -11
Function_Sumrerize_URL_Read.py CHANGED
@@ -1,15 +1,18 @@
 
 
 
 
 
1
  from newspaper import Article
2
  import os
3
  import requests
4
  import gpt_2_simple as gpt2
5
  import tensorflow as tf
6
 
 
 
7
 
8
 
9
- # Example Usage
10
- course_url = "https://uwex.wisconsin.edu/sustainable-management/masters/" # @param {type:"string"} # Wrap the URL in quotes
11
- response = requests.get(course_url)
12
-
13
  def extract_course_information(course_design_variables):
14
  course_data = {}
15
 
@@ -27,18 +30,37 @@ def extract_course_information(course_design_variables):
27
  course_description = article.text if article.text else "Description not found on the page"
28
  course_data['course_description'] = course_description
29
 
30
- return course_data
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  # Example Usage
33
  course_url = "https://uwex.wisconsin.edu/sustainable-management/masters/"
34
  course_design_variables = {"url": course_url}
35
 
36
  # Extract course information
37
- course_data = extract_course_information(course_design_variables)
38
-
39
- # Print the extracted information
40
- print("Course Title: ", course_data['course_title'])
41
- print("Course Description: ", course_data['course_description'])
42
 
43
- # Download the model if not already present
 
 
 
 
 
 
44
 
 
 
 
 
 
 
1
+ from sumy.parsers.plaintext import PlaintextParser
2
+ from sumy.nlp.tokenizers import Tokenizer
3
+ from sumy.summarizers.lsa import LsaSummarizer
4
+
5
+
6
  from newspaper import Article
7
  import os
8
  import requests
9
  import gpt_2_simple as gpt2
10
  import tensorflow as tf
11
 
12
+ import nltk
13
+ nltk.download('punkt')
14
 
15
 
 
 
 
 
16
  def extract_course_information(course_design_variables):
17
  course_data = {}
18
 
 
30
  course_description = article.text if article.text else "Description not found on the page"
31
  course_data['course_description'] = course_description
32
 
33
+ # Extract authors
34
+ authors = ', '.join(article.authors) if article.authors else "Authors not found"
35
+ course_data['authors'] = authors
36
+
37
+ # Extract publish date
38
+ publish_date = article.publish_date if article.publish_date else "Publish date not found"
39
+ course_data['publish_date'] = publish_date
40
+
41
+ # Extract keywords
42
+ keywords = ', '.join(article.keywords) if article.keywords else "Keywords not found"
43
+ course_data['keywords'] = keywords
44
+
45
+ return course_data, article
46
 
47
  # Example Usage
48
  course_url = "https://uwex.wisconsin.edu/sustainable-management/masters/"
49
  course_design_variables = {"url": course_url}
50
 
51
  # Extract course information
52
+ course_data, article = extract_course_information(course_design_variables)
 
 
 
 
53
 
54
+ if False:
55
+ # Print the extracted information
56
+ print("Course Title: ", course_data['course_title'])
57
+ print("Course Description: ", course_data['course_description'])
58
+ print("Authors: ", course_data['authors'])
59
+ print("Publish Date: ", course_data['publish_date'])
60
+ print("Keywords: ", course_data['keywords'])
61
 
62
+ # Summarize the course description with Sumy (LSA summarizer, 3 sentences)
63
+ parser = PlaintextParser.from_string(course_data['course_description'], Tokenizer("english"))
64
+ summarizer = LsaSummarizer()
65
+ summary_sumy = summarizer(parser.document, 3)
66
+ print("\nSumy Summary and remove the html content from this content :\n", summary_sumy)