SoDa12321 committed on
Commit
65e936e
·
verified ·
1 Parent(s): f716c58

Create functions.py

Browse files
Files changed (1) hide show
  1. functions.py +98 -0
functions.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# functions.py

import functools
import io
import os
import re

import nltk
import requests
from docx import Document
from langdetect import detect
from newspaper import Article
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from transformers import pipeline

# Fetch the NLTK "punkt" tokenizer data when the module is imported.
# NOTE(review): presumably required by the Sumy Tokenizer used in
# summarize_text() -- confirm against the installed sumy/nltk versions.
nltk.download('punkt')


def extract_content_from_url(url):
    """
    Extract the title and text content of the page at ``url``.

    The page is downloaded and parsed with ``newspaper.Article``.

    Args:
        url: Address of the page to fetch.

    Returns:
        dict: ``{"title": <title>, "text": <text>}``. If any step raises,
        the error is printed and both values are empty strings, so this
        function never propagates an exception to the caller.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        return {"title": article.title, "text": article.text}
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an
        # empty result instead of failing the caller.
        print(f"Error extracting content from URL: {e}")
        return {"title": "", "text": ""}


def summarize_text(text, num_sentences=5):
    """
    Summarize ``text`` using the LSA summarizer from the Sumy library.

    The language is guessed with ``langdetect.detect`` and handed to Sumy's
    ``Tokenizer``. English is used instead when detection fails, or when the
    tokenizer raises ``LookupError`` for the detected language.

    Args:
        text: Plain text to summarize.
        num_sentences: Number of sentences requested from the summarizer.

    Returns:
        str: The selected sentences joined with single spaces. Empty or
        whitespace-only input yields ``""`` without running the summarizer.
    """
    # Nothing to summarize (e.g. a failed URL extraction returns "" text).
    if not text or not text.strip():
        return ""

    try:
        language = detect(text)
    except Exception:
        # Detection is best-effort; `Exception` (not a bare except) keeps
        # KeyboardInterrupt / SystemExit propagating.
        language = 'english'

    try:
        tokenizer = Tokenizer(language)
    except LookupError:
        # The detected language has no usable tokenizer: fall back to
        # English rather than aborting the whole summary.
        tokenizer = Tokenizer('english')

    parser = PlaintextParser.from_string(text, tokenizer)
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return ' '.join(str(sentence) for sentence in summary)


def clean_text(text):
    """
    Normalize the whitespace in ``text``.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into
    a single space, and leading/trailing whitespace is removed. No other
    characters are changed.

    Args:
        text: The string to normalize.

    Returns:
        str: The whitespace-normalized string.
    """
    return re.sub(r'\s+', ' ', text).strip()


@functools.lru_cache(maxsize=1)
def _get_question_generator():
    """Build the 'e2e-qg' pipeline once and reuse it on later calls."""
    # NOTE(review): 'e2e-qg' does not look like a task name documented for
    # transformers.pipeline -- confirm that the imported `pipeline` really
    # supports it (and the `max_questions` argument used below).
    return pipeline('e2e-qg')


def generate_questions(summary, num_questions=3):
    """
    Generate questions from ``summary`` with the 'e2e-qg' pipeline.

    The pipeline object is created on first use and cached, so repeated
    calls do not rebuild it.

    Args:
        summary: Text to generate questions from.
        num_questions: Passed to the pipeline as ``max_questions``.

    Returns:
        Whatever the pipeline returns for ``summary``; an empty list when
        ``summary`` is empty or whitespace-only (the pipeline is not
        invoked in that case).
    """
    if not summary or not summary.strip():
        return []
    question_generator = _get_question_generator()
    return question_generator(summary, max_questions=num_questions)


def strip_md(text):
    """
    Remove markdown emphasis/heading markers and escape what remains.

    Two steps are applied:
      1. every ``*`` and ``#`` character is deleted;
      2. each remaining ``!``, ``_``, ``=``, ``~`` or ``-`` is prefixed
         with a backslash.

    Args:
        text: The markdown-flavoured string to process.

    Returns:
        str: The processed string. Note that it may contain backslashes
        added by step 2.
    """
    # Deleting "*" also covers "**", so no separate pass is needed.
    without_markers = text.replace("*", "").replace("#", "")
    # "*" can no longer occur here, so it is left out of the character class.
    return re.sub(r'([!_=~-])', r'\\\1', without_markers)


def create_document(title="Business Proposal"):
    """
    Create a new Word document that starts with a heading.

    Args:
        title: Text of the opening heading (added with heading level 0).
            Defaults to "Business Proposal".

    Returns:
        The newly created ``docx`` document object.
    """
    doc = Document()
    doc.add_heading(title, 0)
    return doc


def add_section_to_doc(doc, section_name, section_content):
    """
    Append a section (heading plus one paragraph) to the Word document.

    Args:
        doc: Document object to extend; it is modified in place.
        section_name: Text of the level-1 heading.
        section_content: Body text. It is passed through ``strip_md`` and
            then has every backslash removed before being added.

    Returns:
        The same ``doc`` object, to allow chaining.
    """
    # strip_md() backslash-escapes some punctuation; those escapes are not
    # wanted in the Word output, so drop all backslashes afterwards. This
    # also removes any backslash that was already in the content.
    plain_content = strip_md(section_content).replace("\\", "")
    doc.add_heading(section_name, level=1)
    doc.add_paragraph(plain_content)
    return doc


def get_docx_bytes(doc):
    """
    Serialize the Word document into an in-memory buffer for downloading.

    Args:
        doc: Object exposing ``save(file_like)``, such as the document
            returned by ``create_document``.

    Returns:
        io.BytesIO: Buffer holding the saved document, rewound to position
        0 so it can be read immediately. Note that this is a stream, not a
        ``bytes`` object; call ``.getvalue()`` for the raw bytes.
    """
    buffer = io.BytesIO()
    doc.save(buffer)
    # Rewind: save() leaves the position at the end of the written data.
    buffer.seek(0)
    return buffer