# Bussiness-plan-17-Question-Answerer-2 / Function_Sumrerize_URL_Read.py
# NOTE(review): the following lines were Hugging Face file-viewer artifacts
# captured by the scrape, preserved here as comments:
#   SoDa12321's picture
#   Update Function_Sumrerize_URL_Read.py
#   a8998e7 verified
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from newspaper import Article
import os
import requests
import gpt_2_simple as gpt2
import tensorflow as tf
import nltk
nltk.download('punkt')
def extract_course_information(course_design_variables):
    """Download and parse a web page, extracting course metadata.

    Parameters
    ----------
    course_design_variables : dict
        Must contain a "url" key pointing at the course page to scrape.

    Returns
    -------
    tuple
        (course_data, article) where course_data is a dict with keys
        'course_title', 'course_description', 'authors', 'publish_date'
        and 'keywords' (each falling back to a "... not found" message
        when the page lacks that field), and article is the parsed
        newspaper Article object.
    """
    course_data = {}

    # Retrieve course information from the web (network I/O happens here).
    url = course_design_variables["url"]
    article = Article(url)
    article.download()
    article.parse()

    # Title and body text, with human-readable fallback strings.
    course_data['course_title'] = article.title if article.title else "Title not found on the page"
    course_data['course_description'] = article.text if article.text else "Description not found on the page"

    # Authors and keywords are lists; flatten to comma-separated strings.
    course_data['authors'] = ', '.join(article.authors) if article.authors else "Authors not found"

    # NOTE(review): publish_date is a datetime when found but a str fallback
    # otherwise — downstream consumers must handle both types.
    course_data['publish_date'] = article.publish_date if article.publish_date else "Publish date not found"

    # NOTE(review): in newspaper3k, article.keywords is only populated after
    # article.nlp(); without that call this likely always hits the fallback —
    # confirm whether nlp() should be invoked here.
    course_data['keywords'] = ', '.join(article.keywords) if article.keywords else "Keywords not found"

    return course_data, article
# --- Example usage (runs at import time) ---------------------------------
course_url = "https://uwex.wisconsin.edu/sustainable-management/masters/"
course_design_variables = {"url": course_url}

# Extract course information (performs a network download + parse).
course_data, article = extract_course_information(course_design_variables)

# Debug dump of the extracted fields; deliberately disabled — flip the
# condition to True to re-enable.
if False:
    print("Course Title: ", course_data['course_title'])
    print("Course Description: ", course_data['course_description'])
    print("Authors: ", course_data['authors'])
    print("Publish Date: ", course_data['publish_date'])
    print("Keywords: ", course_data['keywords'])

# Generate an extractive summary with Sumy: LSA summarizer, 3 sentences.
parser = PlaintextParser.from_string(course_data['course_description'], Tokenizer("english"))
summarizer = LsaSummarizer()
summary_sumy = summarizer(parser.document, 3)
print("\nSumy Summary and remove the html content from this content :\n", summary_sumy)