Bussiness-plan-make-3 / functions.py
SoDa12321's picture
Create functions.py
65e936e verified
# functions.py
import os
import io
import re
import requests
from docx import Document
from newspaper import Article
from langdetect import detect
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from transformers import pipeline
import nltk
# Import-time side effect: ask NLTK to fetch its 'punkt' data package whenever
# this module is imported.
# NOTE(review): presumably needed by the sumy Tokenizer used in
# summarize_text -- confirm before moving or removing this call.
nltk.download('punkt')
def extract_content_from_url(url):
    """
    Fetches the page at ``url`` with newspaper's ``Article`` and returns its
    parsed title and body text.

    Returns:
        A dict with the keys "title" and "text". If anything goes wrong while
        building, downloading or parsing the article, the error is printed
        and both values are empty strings.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        extracted = {"title": page.title, "text": page.text}
    except Exception as e:
        # Best effort: report the failure and hand back an empty result
        # instead of propagating the error to the caller.
        print(f"Error extracting content from URL: {e}")
        extracted = {"title": "", "text": ""}
    return extracted
def summarize_text(text, num_sentences=5):
    """
    Summarizes the given text using the LSA summarizer from the Sumy library.

    Args:
        text: Plain text to summarize.
        num_sentences: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined by single spaces. An empty string is
        returned when ``text`` is None, empty, or only whitespace.
    """
    # Nothing to summarize: skip language detection and tokenizer loading.
    if not text or not text.strip():
        return ""

    try:
        language = detect(text)
    except Exception:
        # Language detection is best effort; default to English when it
        # fails. Unlike a bare ``except:``, this does not swallow
        # KeyboardInterrupt or SystemExit.
        language = 'english'

    try:
        tokenizer = Tokenizer(language)
    except LookupError:
        # sumy's Tokenizer raises LookupError when it has no tokenizer data
        # for the requested language; fall back to English rather than
        # failing the whole summary.
        tokenizer = Tokenizer('english')

    parser = PlaintextParser.from_string(text, tokenizer)
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return ' '.join(str(sentence) for sentence in summary)
def clean_text(text):
    """
    Normalizes whitespace: every run of whitespace characters becomes a single
    space, and leading/trailing whitespace is removed.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
# Checkpoint trained for end-to-end question generation. Its output is a
# single string in which the questions are separated by "<sep>".
_QG_MODEL_NAME = "valhalla/t5-small-e2e-qg"
_QG_SEPARATOR = "<sep>"
_qg_pipeline = None


def _get_question_generator():
    """Builds the question-generation pipeline on first use and reuses it."""
    global _qg_pipeline
    if _qg_pipeline is None:
        # 'e2e-qg' is not a task name known to transformers.pipeline; the
        # e2e-qg checkpoints are served through the text2text-generation task.
        _qg_pipeline = pipeline("text2text-generation", model=_QG_MODEL_NAME)
    return _qg_pipeline


def generate_questions(summary, num_questions=3):
    """
    Generates questions based on the summarized text using a
    question-generation pipeline.

    Args:
        summary: Text to generate questions from.
        num_questions: Maximum number of questions to return.

    Returns:
        A list of at most ``num_questions`` question strings. The list is
        empty when ``summary`` is None/blank or ``num_questions`` is not
        positive.
    """
    # Avoid loading the model at all when there is nothing to ask about.
    if not summary or not summary.strip() or num_questions <= 0:
        return []

    generator = _get_question_generator()
    outputs = generator(
        f"generate questions: {summary}",  # task prefix the checkpoint expects
        max_length=256,
        num_beams=4,
        truncation=True,  # keep over-long summaries within the model's input limit
    )
    generated = outputs[0]["generated_text"]

    # Split the single generated string into individual questions and drop
    # the empty fragment left after the trailing separator.
    candidates = (part.strip() for part in generated.split(_QG_SEPARATOR))
    questions = [question for question in candidates if question]
    return questions[:num_questions]
def strip_md(text):
    """
    Removes the markdown markers '**', '*' and '#' from the text, then puts a
    backslash in front of each remaining '!', '*', '_', '=', '~' or '-'.

    Note that the result therefore still contains backslash escapes; it is
    not plain text until those are removed.
    """
    for marker in ("**", "*", "#"):
        text = text.replace(marker, "")
    # Escape the characters that are still present after the markers are gone.
    return re.sub(r'([!*_=~-])', r'\\\1', text)
def create_document():
    """
    Builds a fresh Word document whose only content is a level-0 heading
    reading "Business Proposal", and returns it.
    """
    proposal = Document()
    proposal.add_heading("Business Proposal", level=0)
    return proposal
def add_section_to_doc(doc, section_name, section_content):
    """
    Appends a section to the Word document: a level-1 heading with
    ``section_name`` followed by a paragraph with the cleaned content.

    The content is passed through strip_md() and then every backslash is
    removed from it before it is written. Returns the same ``doc`` object.
    """
    # Remove backslashes
    body_text = strip_md(section_content).replace("\\", "")
    doc.add_heading(section_name, level=1)
    doc.add_paragraph(body_text)
    return doc
def get_docx_bytes(doc):
    """
    Saves the Word document into an in-memory buffer for downloading.

    Returns:
        An ``io.BytesIO`` holding the saved document, rewound to position 0
        so it can be read from the start.
    """
    buffer = io.BytesIO()
    doc.save(buffer)
    # Rewind so the consumer reads the document from its first byte.
    buffer.seek(0)
    return buffer