Spaces:
Build error
Build error
| # functions.py | |
| import os | |
| import io | |
| import re | |
| import requests | |
| from docx import Document | |
| from newspaper import Article | |
| from langdetect import detect | |
| from sumy.parsers.plaintext import PlaintextParser | |
| from sumy.nlp.tokenizers import Tokenizer | |
| from sumy.summarizers.lsa import LsaSummarizer | |
| from transformers import pipeline | |
| import nltk | |
| nltk.download('punkt') | |
def extract_content_from_url(url):
    """
    Download and parse the page at ``url`` with newspaper's ``Article``.

    Returns a dict with the keys ``"title"`` and ``"text"``. If anything
    raises while downloading or parsing, the error is printed and both
    values are empty strings, so no exception escapes this function.
    """
    result = {"title": "", "text": ""}
    try:
        page = Article(url)
        page.download()
        page.parse()
        result = {"title": page.title, "text": page.text}
    except Exception as e:
        # Best-effort: report the problem and fall through to the empty result.
        print(f"Error extracting content from URL: {e}")
    return result
# langdetect reports ISO 639-1 codes ("en", "de", ...), whereas sumy's
# Tokenizer expects the NLTK punkt language names listed here.
_SUMY_LANGUAGE_BY_ISO_CODE = {
    "cs": "czech",
    "da": "danish",
    "de": "german",
    "el": "greek",
    "en": "english",
    "es": "spanish",
    "et": "estonian",
    "fi": "finnish",
    "fr": "french",
    "it": "italian",
    "nl": "dutch",
    "no": "norwegian",
    "pl": "polish",
    "pt": "portuguese",
    "ru": "russian",
    "sl": "slovene",
    "sv": "swedish",
    "tr": "turkish",
}


def summarize_text(text, num_sentences=5):
    """
    Summarize ``text`` with the LSA summarizer from the Sumy library.

    The language is guessed with ``langdetect`` and translated to the
    language name Sumy's tokenizer understands. When detection fails, or the
    detected language has no entry in the mapping, English is used.

    Args:
        text: The text to summarize.
        num_sentences: How many sentences the summary should contain.

    Returns:
        The selected sentences joined by single spaces, or an empty string
        when ``text`` is empty or contains only whitespace.
    """
    # Nothing to summarize; also avoids a pointless language detection error.
    if not text or not text.strip():
        return ""

    try:
        language = _SUMY_LANGUAGE_BY_ISO_CODE.get(detect(text), "english")
    except Exception:
        # langdetect raises when it cannot find usable features in the text.
        language = "english"

    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summary = LsaSummarizer()(parser.document, num_sentences)
    return " ".join(str(sentence) for sentence in summary)
def clean_text(text):
    """
    Normalize the whitespace in ``text``.

    Each run of whitespace characters becomes a single space, and leading
    and trailing whitespace is dropped from the result.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
# Hub checkpoint fine-tuned for end-to-end question generation with T5.
# NOTE(review): checkpoint choice is an assumption -- confirm it is the model
# this project intends to use.
_QG_MODEL_NAME = "valhalla/t5-base-e2e-qg"

# Filled on first use and reused afterwards: building a pipeline reloads the
# model weights, which is too slow to repeat on every call.
_QG_PIPELINE_CACHE = {}


def _get_question_generator():
    """Return the shared question-generation pipeline, creating it once."""
    if "generator" not in _QG_PIPELINE_CACHE:
        # 'e2e-qg' is not a task known to transformers.pipeline; these
        # checkpoints are ordinary text2text-generation models.
        _QG_PIPELINE_CACHE["generator"] = pipeline(
            "text2text-generation", model=_QG_MODEL_NAME
        )
    return _QG_PIPELINE_CACHE["generator"]


def generate_questions(summary, num_questions=3):
    """
    Generate questions about ``summary`` with a question-generation model.

    Args:
        summary: The text the questions should be about.
        num_questions: Upper bound on the number of questions returned.

    Returns:
        A list of at most ``num_questions`` question strings. The list is
        empty when ``summary`` is blank or ``num_questions`` is not positive;
        the model is not loaded in that case.
    """
    if num_questions <= 0 or not summary or not summary.strip():
        return []

    generator = _get_question_generator()
    # The e2e-qg checkpoints expect this task prefix and emit every question
    # in one string, separated by "<sep>".
    outputs = generator(
        "generate questions: " + summary.strip(),
        max_length=256,
        num_beams=4,
    )
    generated = outputs[0]["generated_text"]
    questions = [part.strip() for part in generated.split("<sep>")]
    return [question for question in questions if question][:num_questions]
def strip_md(text):
    """
    Drop asterisks and hash signs from ``text``, then backslash-escape the
    markdown punctuation that is left.

    All ``**``, ``*`` and ``#`` occurrences are deleted first. Afterwards
    each ``!``, ``_``, ``=``, ``~`` or ``-`` is prefixed with a backslash.
    """
    for marker in ("**", "*", "#"):
        text = text.replace(marker, "")
    # "*" stays in the character class but can no longer match at this point.
    return re.sub(r'([!*_=~-])', r'\\\1', text)
def create_document():
    """
    Build a new Word document that starts with the heading
    "Business Proposal", added at heading level 0.
    """
    document = Document()
    document.add_heading("Business Proposal", level=0)
    return document
def add_section_to_doc(doc, section_name, section_content):
    """
    Append a level-1 heading and one paragraph to ``doc``.

    ``section_content`` is run through ``strip_md`` and then has every
    backslash removed before it is added as the paragraph text. The same
    ``doc`` object that was passed in is returned.
    """
    # strip_md inserts backslash escapes; they are not wanted in the document.
    body_text = strip_md(section_content).replace("\\", "")
    doc.add_heading(section_name, level=1)
    doc.add_paragraph(body_text)
    return doc
def get_docx_bytes(doc):
    """
    Save ``doc`` into an in-memory buffer for downloading.

    Returns the ``io.BytesIO`` buffer itself (not a ``bytes`` value),
    rewound to position 0 so it can be read from the start.
    """
    buffer = io.BytesIO()
    doc.save(buffer)
    buffer.seek(0)
    return buffer