izhan001 committed on
Commit
ddb93bd
·
verified ·
1 Parent(s): e2e76a0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -0
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import docx
3
+ import PyPDF2
4
+ from pptx import Presentation
5
+ from transformers import pipeline
6
+ from docx import Document
7
+ from io import BytesIO
8
+ import tempfile
9
+
# Hugging Face pipelines used by process_file. They are created once, at
# module import, and shared by every call.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
)
rephraser = pipeline(
    "text2text-generation",
    model="Vamsi/T5_Paraphrase_Paws",
    max_length=512,
    truncation=True,
)
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
14
+
def read_file(file, file_type):
    """Extract the plain text of an uploaded document.

    Args:
        file: The upload. Accepted shapes are raw ``bytes``, a filesystem
            path, or an object exposing a ``name`` path and/or ``read()``.
            NOTE(review): which of these the UI actually passes depends on
            the Gradio version and the ``gr.File`` ``type`` setting -- confirm.
        file_type: One of ``"docx"``, ``"txt"``, ``"pdf"`` or ``"pptx"``.

    Returns:
        The extracted text, one paragraph / page / shape per line for the
        binary formats. An unrecognised ``file_type`` yields ``""``.
    """
    import os  # local import: only needed for the path handling below

    is_raw = isinstance(file, (bytes, bytearray))

    if file_type == "txt":
        if is_raw:
            data = bytes(file)
        else:
            # Prefer reading from disk when we can find a path: an already
            # opened upload object may not be positioned at its start.
            if isinstance(file, (str, os.PathLike)):
                path = file
            else:
                path = getattr(file, "name", None)
            if isinstance(path, (str, os.PathLike)) and os.path.isfile(path):
                with open(path, "rb") as handle:
                    data = handle.read()
            else:
                data = file.read()
        if isinstance(data, (bytes, bytearray)):
            return bytes(data).decode("utf-8")
        return data

    # The document parsers accept a path or a binary stream, not raw bytes.
    source = BytesIO(bytes(file)) if is_raw else file

    if file_type == "docx":
        return "".join(para.text + "\n" for para in Document(source).paragraphs)

    if file_type == "pdf":
        pdf_reader = PyPDF2.PdfReader(source)
        # extract_text() may give None for a page without a text layer.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf_reader.pages
        )

    if file_type == "pptx":
        prs = Presentation(source)
        return "".join(
            shape.text + "\n"
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )

    return ""
35
+
def process_file(file, file_type, language="en"):
    """Read an uploaded document and derive the values shown in the UI.

    Args:
        file: The uploaded document, forwarded unchanged to ``read_file``.
        file_type: Document format selector, forwarded to ``read_file``.
        language: Currently unused; kept so existing callers keep working.

    Returns:
        A 6-tuple ``(content, rephrased_text, summary_text, sentiment_text,
        keywords, processed_file_path)``. When the document cannot be read or
        has no text, the first element is an error message and the remaining
        five are ``None``. A failure in one model step is reported in that
        step's slot as an error string and does not stop the other steps.
    """
    empty_message = "Error: The document is empty or unsupported format."

    # Nothing was uploaded: there is no document to read.
    if file is None:
        return empty_message, None, None, None, None, None

    # This is the UI boundary, so a parser failure (corrupt or mislabelled
    # file) is reported as a message, like the model errors below.
    try:
        content = read_file(file, file_type)
    except Exception as e:
        return f"Error: Could not read the document: {e}", None, None, None, None, None

    if not content.strip():
        return empty_message, None, None, None, None, None

    # Summarize the content. truncation=True clips over-long input to the
    # model's maximum length instead of failing on long documents.
    try:
        summary = summarizer(
            content,
            max_length=150,
            min_length=50,
            do_sample=False,
            truncation=True,
        )
        summary_text = summary[0]['summary_text']
    except Exception as e:
        summary_text = f"Summary Error: {str(e)}"

    # Rephrase the entire content in fixed-size character chunks.
    try:
        chunk_size = 500  # characters per chunk, not tokens
        rephrased_parts = [
            rephraser(content[start:start + chunk_size])[0]['generated_text']
            for start in range(0, len(content), chunk_size)
        ]
        rephrased_text = " ".join(rephrased_parts)
    except Exception as e:
        rephrased_text = f"Rephrase Error: {str(e)}"

    # Sentiment analysis on the first 512 characters. A character slice does
    # not bound the token count, so truncation is requested as well.
    try:
        sentiment = sentiment_analyzer(content[:512], truncation=True)
        sentiment_text = sentiment[0]['label']
    except Exception as e:
        sentiment_text = f"Sentiment Analysis Error: {str(e)}"

    # Placeholder keyword extraction: the first 10 whitespace-separated words.
    keywords = ' '.join(content.split()[:10])

    # Save the text to a temporary file for the download output.
    # NOTE(review): this writes the original extracted text, not the
    # rephrased text -- confirm that is the intended "processed" document.
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.txt') as temp_file:
            temp_file.write(content.encode('utf-8'))
            processed_file_path = temp_file.name
    except (OSError, UnicodeError):
        # This slot must hold a file path or None, never an error message.
        processed_file_path = None

    return content, rephrased_text.strip(), summary_text, sentiment_text, keywords, processed_file_path
81
+
# Gradio UI: a file upload plus a file-type selector feed process_file, whose
# six return values fill five text boxes and one downloadable file.
input_components = [
    gr.File(label="Upload Document (PDF, DOCX, TXT, PPTX)"),
    gr.Dropdown(["pdf", "docx", "txt", "pptx"], label="File Type"),
]

output_components = [
    gr.Textbox(label="Original Content"),
    gr.Textbox(label="Rephrased Content"),
    gr.Textbox(label="Summary"),
    gr.Textbox(label="Sentiment Analysis"),
    gr.Textbox(label="Keywords"),
    gr.File(label="Download Processed Document"),
]

iface = gr.Interface(
    fn=process_file,
    inputs=input_components,
    outputs=output_components,
    title="Enhanced Document Processor",
    description="Upload a document to rephrase, summarize, analyze sentiment, extract keywords, and highlight key information. Supports PDF, DOCX, TXT, PPTX.",
)

# Start the web app as soon as the module runs.
iface.launch()