import os

import gpt_2_simple as gpt2
import nltk
import requests
import tensorflow as tf
from newspaper import Article
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer

# Fetch the NLTK 'punkt' tokenizer data before any tokenization happens.
nltk.download('punkt')

# Set to True to dump every extracted field to stdout while debugging.
PRINT_COURSE_DETAILS = False


def extract_course_information(course_design_variables):
    """Download a course web page and collect its basic metadata.

    Args:
        course_design_variables: Mapping that must contain a ``"url"`` key
            holding the address of the page to fetch.

    Returns:
        A ``(course_data, article)`` tuple. ``course_data`` is a dict with
        the keys ``course_title``, ``course_description``, ``authors``,
        ``publish_date`` and ``keywords``; each one falls back to a
        placeholder string when the page yields nothing for it.
        ``publish_date`` is passed through unconverted from
        ``article.publish_date`` when present. ``article`` is the
        ``newspaper.Article`` object the values were read from.

    Raises:
        KeyError: If ``course_design_variables`` has no ``"url"`` entry.
    """
    course_data = {}

    # Retrieve course information from the web
    url = course_design_variables["url"]
    article = Article(url)
    article.download()
    article.parse()

    # newspaper only fills ``article.keywords`` during ``nlp()``; without
    # this call the list is always empty and the placeholder is returned.
    try:
        article.nlp()
    except LookupError:
        # NLTK raises LookupError when tokenizer data is unavailable.
        # Keyword extraction is best-effort, so keep the fallback below.
        pass

    # Extract course title
    course_data['course_title'] = (
        article.title or "Title not found on the page"
    )

    # Extract course description
    course_data['course_description'] = (
        article.text or "Description not found on the page"
    )

    # Extract authors
    course_data['authors'] = (
        ', '.join(article.authors) if article.authors
        else "Authors not found"
    )

    # Extract publish date
    course_data['publish_date'] = (
        article.publish_date or "Publish date not found"
    )

    # Extract keywords
    course_data['keywords'] = (
        ', '.join(article.keywords) if article.keywords
        else "Keywords not found"
    )

    return course_data, article


# Example Usage
course_url = "https://uwex.wisconsin.edu/sustainable-management/masters/"
course_design_variables = {"url": course_url}

# Extract course information
course_data, article = extract_course_information(course_design_variables)

if PRINT_COURSE_DETAILS:
    # Print the extracted information
    print("Course Title: ", course_data['course_title'])
    print("Course Description: ", course_data['course_description'])
    print("Authors: ", course_data['authors'])
    print("Publish Date: ", course_data['publish_date'])
    print("Keywords: ", course_data['keywords'])

# Generate text with Sumy: a 3-sentence LSA summary of the description
parser = PlaintextParser.from_string(
    course_data['course_description'], Tokenizer("english")
)
summarizer = LsaSummarizer()
summary_sumy = summarizer(parser.document, 3)
print(
    "\nSumy Summary and remove the html content from this content :\n",
    summary_sumy,
)