File size: 2,739 Bytes
65e936e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# functions.py

import os
import io
import re
import requests
from docx import Document
from newspaper import Article
from langdetect import detect
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from transformers import pipeline
import nltk

# Fetch the NLTK 'punkt' resource as a side effect of importing this module.
# NOTE(review): presumably required by the sumy Tokenizer used in
# summarize_text — confirm, and consider whether an import-time download
# is acceptable for every consumer of this module.
nltk.download('punkt')

def extract_content_from_url(url):
    """
    Fetch an article with newspaper's ``Article`` and return its title and body.

    Args:
        url: Address of the page to download and parse.

    Returns:
        A dict with the keys ``"title"`` and ``"text"``. If anything goes
        wrong while downloading or parsing, the error is printed and both
        values are empty strings.
    """
    # Start from the failure result so there is a single exit point.
    result = {"title": "", "text": ""}
    try:
        page = Article(url)
        page.download()
        page.parse()
        result = {"title": page.title, "text": page.text}
    except Exception as err:
        # Best-effort extraction: report the problem and keep the empty result.
        print(f"Error extracting content from URL: {err}")
    return result

def summarize_text(text, num_sentences=5):
    """
    Summarizes the given text using the LSA summarizer from the Sumy library.

    The language is guessed with ``langdetect.detect``; when detection fails,
    or when no tokenizer can be built for the detected language, English is
    used instead.

    Args:
        text: Plain text to summarize.
        num_sentences: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined by single spaces, or an empty string
        when ``text`` is empty or contains only whitespace.

    Raises:
        LookupError: If the English fallback tokenizer cannot be built either
            (e.g. the NLTK tokenizer data is missing).
    """
    if not text or not text.strip():
        # Nothing to summarize; skip language detection and tokenizer setup.
        return ""

    try:
        language = detect(text)
    except Exception:
        # Detection is best-effort. Catching Exception (not a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        language = 'english'

    try:
        tokenizer = Tokenizer(language)
    except LookupError:
        # The detected language has no usable tokenizer; fall back to English
        # rather than failing the whole summary.
        tokenizer = Tokenizer('english')

    parser = PlaintextParser.from_string(text, tokenizer)
    summary = LsaSummarizer()(parser.document, num_sentences)
    return ' '.join(str(sentence) for sentence in summary)

def clean_text(text):
    """
    Normalize whitespace in ``text``.

    Every run of whitespace characters is collapsed into a single space and
    leading/trailing whitespace is removed.
    """
    return re.sub(r'\s+', ' ', text).strip()

def generate_questions(summary, num_questions=3):
    """
    Produce questions for ``summary`` with an ``'e2e-qg'`` pipeline.

    A new pipeline is built on every call and invoked with
    ``max_questions=num_questions``; its result is returned unchanged.
    """
    # NOTE(review): 'e2e-qg' does not look like a task name known to the
    # `transformers.pipeline` imported by this file — confirm which
    # `pipeline` factory is intended here.
    return pipeline('e2e-qg')(summary, max_questions=num_questions)

def strip_md(text):
    """
    Neutralize markdown markup in ``text``.

    All ``*`` and ``#`` characters are deleted, then each remaining
    ``!``, ``_``, ``=``, ``~`` or ``-`` is prefixed with a backslash.
    """
    # Deleting every '*' also covers the '**' bold marker.
    without_markers = text.translate(str.maketrans("", "", "*#"))
    return re.sub(r'([!*_=~-])', r'\\\1', without_markers)

def create_document(title="Business Proposal"):
    """
    Creates a new Word document with a top-level heading.

    Args:
        title: Text of the heading added at level 0. Defaults to
            "Business Proposal", so existing callers that pass no
            argument get the same document as before.

    Returns:
        The newly created ``Document`` containing only that heading.
    """
    doc = Document()
    doc.add_heading(title, 0)
    return doc

def add_section_to_doc(doc, section_name, section_content):
    """
    Append a level-1 heading and one paragraph to ``doc``.

    ``section_content`` is passed through ``strip_md`` and then every
    backslash is removed before the text is written as the paragraph.

    Returns:
        The same ``doc`` object that was passed in.
    """
    # strip_md inserts backslashes; removing all backslashes afterwards
    # leaves plain text for the Word paragraph.
    body = strip_md(section_content).replace("\\", "")
    doc.add_heading(section_name, level=1)
    doc.add_paragraph(body)
    return doc

def get_docx_bytes(doc):
    """
    Serialize the Word document into an in-memory buffer for downloading.

    Returns:
        An ``io.BytesIO`` holding whatever ``doc.save`` wrote, positioned
        at offset 0 so it can be read from the beginning.
    """
    buffer = io.BytesIO()
    doc.save(buffer)
    # Rewind so the consumer does not start reading at end-of-stream.
    buffer.seek(0)
    return buffer