Spaces:

izhan001
/

Smart-Doc-Processor

Build error

App Files Files Community

izhan001 committed on Nov 7, 2024

Commit

ddb93bd

verified ·

1 Parent(s): e2e76a0

Create app.py

Browse files

Files changed (1) hide show

app.py +101 -0

app.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import gradio as gr
+import docx
+import PyPDF2
+from pptx import Presentation
+from transformers import pipeline
+from docx import Document
+from io import BytesIO
+import tempfile
+# Load the three Hugging Face pipelines once, at import time, so that every
+# request handled by the app reuses the same model instances.
+# Summarization model.
+summarizer = pipeline(
+    task="summarization",
+    model="facebook/bart-large-cnn",
+)
+# Paraphrasing model; inputs are truncated and outputs capped at 512.
+rephraser = pipeline(
+    task="text2text-generation",
+    model="Vamsi/T5_Paraphrase_Paws",
+    max_length=512,
+    truncation=True,
+)
+# Sentiment classification model.
+sentiment_analyzer = pipeline(
+    task="sentiment-analysis",
+    model="distilbert-base-uncased-finetuned-sst-2-english",
+)
+# Function to read content from different file types
+def read_file(file, file_type):
+    """Extract plain text from an uploaded document.
+
+    Args:
+        file: The document as raw bytes, a filesystem path, or an object
+            exposing ``read()``. ``None`` is treated as "nothing uploaded".
+        file_type: One of "docx", "txt", "pdf" or "pptx".
+
+    Returns:
+        The extracted text, with a newline after every paragraph, page or
+        shape. An empty string is returned when ``file`` is ``None`` or
+        ``file_type`` is not one of the supported values.
+    """
+    if file is None:
+        return ""
+    if file_type == "txt":
+        # The upload may arrive as bytes, as a path on disk, or as an open
+        # file object depending on how the upload widget is configured.
+        if isinstance(file, (bytes, bytearray)):
+            raw = bytes(file)
+        elif hasattr(file, "read"):
+            raw = file.read()
+        else:
+            with open(file, "rb") as handle:
+                raw = handle.read()
+        # A text-mode file object already yields str; only bytes need decoding.
+        return raw if isinstance(raw, str) else raw.decode("utf-8")
+    # The binary parsers take a path or a stream, so wrap raw bytes in a stream.
+    source = BytesIO(file) if isinstance(file, (bytes, bytearray)) else file
+    parts = []
+    if file_type == "docx":
+        parts.extend(para.text for para in Document(source).paragraphs)
+    elif file_type == "pdf":
+        # Defensive: fall back to "" if a page yields no text (falsy result).
+        parts.extend(
+            page.extract_text() or "" for page in PyPDF2.PdfReader(source).pages
+        )
+    elif file_type == "pptx":
+        for slide in Presentation(source).slides:
+            parts.extend(
+                shape.text for shape in slide.shapes if hasattr(shape, "text")
+            )
+    return "".join(part + "\n" for part in parts)
+# Function to process the file and generate outputs
+def process_file(file, file_type, language="en"):
+    """Run every analysis step on an uploaded document.
+
+    Args:
+        file: The uploaded document, forwarded unchanged to ``read_file``.
+        file_type: One of "docx", "txt", "pdf" or "pptx".
+        language: Currently unused; kept so existing callers keep working.
+
+    Returns:
+        A 6-tuple ``(content, rephrased_text, summary_text, sentiment_text,
+        keywords, processed_file_path)``. When the document cannot be read or
+        is empty, the first element is an error message and the other five
+        are ``None``. A failure in the summary, rephrase or sentiment step is
+        reported as an error message in that step's own slot.
+        ``processed_file_path`` is ``None`` if the download copy could not
+        be written.
+    """
+    if file is None:
+        return "Error: No document was uploaded.", None, None, None, None, None
+    # Report parser failures (corrupt or mislabelled files) the same way the
+    # later steps report theirs, instead of letting the exception escape.
+    try:
+        content = read_file(file, file_type)
+    except Exception as e:
+        return f"Error reading document: {str(e)}", None, None, None, None, None
+    # Check if content is not empty
+    if not content.strip():
+        return "Error: The document is empty or unsupported format.", None, None, None, None, None
+    # Summarize the content; truncation keeps long documents within the
+    # model's maximum input length instead of failing.
+    try:
+        summary = summarizer(
+            content, max_length=150, min_length=50, do_sample=False, truncation=True
+        )
+        summary_text = summary[0]['summary_text']
+    except Exception as e:
+        summary_text = f"Summary Error: {str(e)}"
+    # Rephrase the entire content in manageable chunks
+    try:
+        chunk_size = 500  # Characters per chunk; adjust to model and resource limits
+        rephrased_text = " ".join(
+            rephraser(content[start:start + chunk_size])[0]['generated_text']
+            for start in range(0, len(content), chunk_size)
+        )
+    except Exception as e:
+        rephrased_text = f"Rephrase Error: {str(e)}"
+    # Sentiment analysis on the first 512 characters (characters, not tokens)
+    try:
+        sentiment = sentiment_analyzer(content[:512])
+        sentiment_text = sentiment[0]['label']
+    except Exception as e:
+        sentiment_text = f"Sentiment Analysis Error: {str(e)}"
+    # Placeholder keyword extraction: simply the first 10 whitespace-separated words
+    keywords = ' '.join(content.split()[:10])
+    # Save a copy of the extracted text for the download link
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.txt') as temp_file:
+            temp_file.write(content.encode('utf-8'))
+            processed_file_path = temp_file.name
+    except (OSError, UnicodeError) as e:
+        # This slot feeds a file output, so it must be a real path or None;
+        # an error message here would be mistaken for a file path.
+        print(f"Error saving processed document: {str(e)}")
+        processed_file_path = None
+    return content, rephrased_text.strip(), summary_text, sentiment_text, keywords, processed_file_path
+# Build the Gradio UI: one upload widget and a file-type selector feed
+# process_file; its six return values map, in order, onto five text boxes
+# and a downloadable file.
+iface = gr.Interface(
+    fn=process_file,
+    inputs=[
+        gr.File(label="Upload Document (PDF, DOCX, TXT, PPTX)"),
+        gr.Dropdown(["pdf", "docx", "txt", "pptx"], label="File Type"),
+    ],
+    outputs=[
+        *(
+            gr.Textbox(label=caption)
+            for caption in (
+                "Original Content",
+                "Rephrased Content",
+                "Summary",
+                "Sentiment Analysis",
+                "Keywords",
+            )
+        ),
+        gr.File(label="Download Processed Document"),
+    ],
+    title="Enhanced Document Processor",
+    description="Upload a document to rephrase, summarize, analyze sentiment, extract keywords, and highlight key information. Supports PDF, DOCX, TXT, PPTX."
+)
+# Start serving the app as soon as this module is executed.
+iface.launch()