Spaces:
Build error
Build error
# functions.py
# NOTE(review): the surrounding page text reads "Rename app_url_probl... to
# app_url_problem.py", so this module may now live under that name - confirm.
import os
import io
import re

import requests
from docx import Document
from newspaper import Article
from langdetect import detect
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from transformers import pipeline
import nltk

# Import-time side effect kept from the original: fetch the NLTK 'punkt' data.
nltk.download('punkt')


def extract_content_from_url(url: str) -> dict:
    """
    Download and parse the page at ``url`` with newspaper's ``Article``.

    Returns a dict with the keys ``"title"`` and ``"text"``. Any exception
    raised while downloading or parsing is printed and an empty result
    (both values ``""``) is returned instead, so this function never raises
    for a bad URL.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        return {"title": article.title, "text": article.text}
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back empty fields.
        print(f"Error extracting content from URL: {e}")
        return {"title": "", "text": ""}


def _build_tokenizer(language):
    """Return a sumy ``Tokenizer`` for ``language``, falling back to English."""
    try:
        return Tokenizer(language)
    except LookupError:
        # sumy raises LookupError when it has no tokenizer data for the
        # requested language. If English itself is unavailable there is
        # nothing left to fall back to, so let the error propagate.
        if language == 'english':
            raise
        return Tokenizer('english')


def summarize_text(text: str, num_sentences: int = 5) -> str:
    """
    Summarize ``text`` with sumy's LSA summarizer.

    The language is guessed with ``langdetect.detect``; if detection fails,
    or sumy has no tokenizer for the detected language, the English
    tokenizer is used. The selected sentences are joined with single spaces.

    ``num_sentences`` is passed to the summarizer as the sentence count.
    """
    try:
        language = detect(text)
    except Exception:
        # Narrower than a bare ``except:`` so KeyboardInterrupt / SystemExit
        # are no longer swallowed; any detection failure still means English.
        language = 'english'

    parser = PlaintextParser.from_string(text, _build_tokenizer(language))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return ' '.join(str(sentence) for sentence in summary)


def clean_text(text: str) -> str:
    """
    Collapse every run of whitespace in ``text`` to a single space and strip
    leading and trailing whitespace.
    """
    return re.sub(r'\s+', ' ', text).strip()


def generate_questions(summary, num_questions=3):
    """
    Generate questions from ``summary`` using a ``pipeline('e2e-qg')`` object.

    ``num_questions`` is forwarded as ``max_questions``. The return value is
    whatever the pipeline call returns.
    """
    # NOTE(review): 'e2e-qg' does not look like one of the task names listed
    # in the transformers ``pipeline`` documentation - confirm which
    # ``pipeline`` implementation/model this is meant to load.
    question_generator = pipeline('e2e-qg')
    return question_generator(summary, max_questions=num_questions)


def strip_md(text: str) -> str:
    """
    Remove basic markdown markers from ``text`` and escape the rest.

    Every ``*`` and ``#`` character is deleted, then each remaining
    ``!``, ``_``, ``=``, ``~`` or ``-`` is prefixed with a backslash.
    """
    # Deleting every "*" already covers "**", so a single replace is enough.
    text = text.replace("*", "").replace("#", "")
    return re.sub(r'([!*_=~-])', r'\\\1', text)


def create_document():
    """
    Create a new Word document whose first element is the level-0 heading
    "Business Proposal", and return it.
    """
    doc = Document()
    doc.add_heading("Business Proposal", 0)
    return doc


def add_section_to_doc(doc, section_name, section_content):
    """
    Append a section to ``doc`` and return ``doc``.

    ``section_content`` is passed through ``strip_md`` and then every
    backslash is removed (both the escapes ``strip_md`` inserted and any
    backslashes present in the input). ``section_name`` is added as a
    level-1 heading followed by the cleaned content as one paragraph.
    """
    section_content = strip_md(section_content)
    section_content = section_content.replace("\\", "")  # Remove backslashes
    doc.add_heading(section_name, level=1)
    doc.add_paragraph(section_content)
    return doc


def get_docx_bytes(doc) -> io.BytesIO:
    """
    Save ``doc`` into an in-memory ``io.BytesIO`` buffer and return the
    buffer rewound to position 0, ready to be read (e.g. for a download).
    """
    doc_io = io.BytesIO()
    doc.save(doc_io)
    doc_io.seek(0)
    return doc_io
472e45b
verified