# Extractive summarization (sumy)
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# Article scraping (newspaper3k)
from newspaper import Article

import os
import requests
import gpt_2_simple as gpt2
import tensorflow as tf

import nltk

# The sumy Tokenizer (and newspaper's nlp step) rely on NLTK's "punkt" sentence tokenizer
nltk.download('punkt')

def extract_course_information(course_design_variables):
    """Download the course page and extract title, description, authors,
    publish date, and keywords with newspaper3k."""
    course_data = {}

    # Download and parse the course page
    url = course_design_variables["url"]
    article = Article(url)
    article.download()
    article.parse()
    # nlp() is needed to populate article.keywords
    article.nlp()

    # Title
    course_title = article.title if article.title else "Title not found on the page"
    course_data['course_title'] = course_title

    # Description (full page text)
    course_description = article.text if article.text else "Description not found on the page"
    course_data['course_description'] = course_description

    # Authors
    authors = ', '.join(article.authors) if article.authors else "Authors not found"
    course_data['authors'] = authors

    # Publish date
    publish_date = article.publish_date if article.publish_date else "Publish date not found"
    course_data['publish_date'] = publish_date

    # Keywords
    keywords = ', '.join(article.keywords) if article.keywords else "Keywords not found"
    course_data['keywords'] = keywords

    return course_data, article

course_url = "https://uwex.wisconsin.edu/sustainable-management/masters/" |
|
|
course_design_variables = {"url": course_url} |
|
|
|
|
|
|
|
|
course_data, article = extract_course_information(course_design_variables) |
|
|
|
|
|
# Flip this flag to print the extracted fields
PRINT_COURSE_DATA = False

if PRINT_COURSE_DATA:
    print("Course Title: ", course_data['course_title'])
    print("Course Description: ", course_data['course_description'])
    print("Authors: ", course_data['authors'])
    print("Publish Date: ", course_data['publish_date'])
    print("Keywords: ", course_data['keywords'])

# Summarize the course description with sumy's LSA summarizer (3 sentences)
parser = PlaintextParser.from_string(course_data['course_description'], Tokenizer("english"))
summarizer = LsaSummarizer()
summary_sumy = summarizer(parser.document, 3)

# The summarizer returns Sentence objects, so join them into plain text before printing
summary_text = " ".join(str(sentence) for sentence in summary_sumy)
print("\nSumy Summary:\n", summary_text)