File size: 5,938 Bytes
ddb93bd
 
 
 
 
 
 
 
 
 
840069e
ddb93bd
 
 
 
 
 
781ba9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddb93bd
 
 
 
 
840069e
ddb93bd
781ba9f
ddb93bd
840069e
ddb93bd
 
 
 
 
 
 
 
 
 
840069e
ddb93bd
 
 
 
 
 
 
 
 
840069e
ddb93bd
 
 
 
 
840069e
ddb93bd
 
 
 
 
 
 
 
 
 
 
840069e
 
 
 
781ba9f
ddb93bd
840069e
 
 
 
 
 
 
 
781ba9f
 
 
 
 
 
 
 
 
 
 
 
 
 
840069e
 
2063d0f
840069e
 
 
 
 
 
 
 
 
 
3cd943f
840069e
 
 
781ba9f
840069e
 
3cd943f
 
781ba9f
 
 
3cd943f
 
781ba9f
3cd943f
840069e
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import gradio as gr
import docx
import PyPDF2
from pptx import Presentation
from transformers import pipeline
from docx import Document
from io import BytesIO
import tempfile

# Initialize Hugging Face models for summarization, rephrasing, and sentiment analysis.
# NOTE: each pipeline() call loads (and on first run downloads) model weights at
# module import time, so startup can be slow and memory-hungry.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# max_length/truncation here bound the paraphraser's generated output per chunk.
rephraser = pipeline("text2text-generation", model="Vamsi/T5_Paraphrase_Paws", max_length=512, truncation=True)
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Function to read content from different file types
def read_file(file, file_type):
    """Extract plain text from an uploaded document.

    Parameters
    ----------
    file : file-like object (or path accepted by the underlying parser)
        The uploaded document.
    file_type : str
        One of "docx", "txt", "pdf", "pptx". Any other value returns "".

    Returns
    -------
    str
        The extracted text, or an ``"Error reading the file: ..."`` message
        if parsing raised.
    """
    content = ""
    try:
        if file_type == "docx":
            doc = Document(file)
            for para in doc.paragraphs:
                content += para.text + "\n"
        elif file_type == "txt":
            data = file.read()
            # Accept both binary and text-mode streams: only bytes need decoding.
            content = data.decode("utf-8") if isinstance(data, bytes) else data
        elif file_type == "pdf":
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                # extract_text() may return None (e.g. image-only pages);
                # guard so we don't raise TypeError on the concatenation.
                content += (page.extract_text() or "") + "\n"
        elif file_type == "pptx":
            prs = Presentation(file)
            for slide in prs.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        content += shape.text + "\n"
    except Exception as e:
        content = f"Error reading the file: {str(e)}"

    return content

# Function to process the file and generate outputs
def process_file(file, file_type, language="en"):
    """Run the full analysis pipeline on an uploaded document.

    Parameters
    ----------
    file : file-like object
        The uploaded document, forwarded to :func:`read_file`.
    file_type : str
        One of "pdf", "docx", "txt", "pptx".
    language : str, optional
        Reserved for future localization; currently unused.

    Returns
    -------
    tuple
        ``(content, rephrased_text, summary_text, sentiment_text,
        keywords, processed_file_path)``. On an empty/unreadable document
        the first element is an error string and the rest are ``None``.
    """
    content = read_file(file, file_type)

    # Bail out on empty documents or reader failures. Match the exact error
    # prefix produced by read_file instead of scanning for the word "Error",
    # which could legitimately occur inside a valid document.
    if not content.strip() or content.startswith("Error reading the file:"):
        return "Error: The document is empty or unsupported format.", None, None, None, None, None

    # Summarize the content. BART has a fixed input length, so let the
    # pipeline truncate rather than erroring out on long documents.
    try:
        summary = summarizer(content, max_length=150, min_length=50, do_sample=False, truncation=True)
        summary_text = summary[0]['summary_text']
    except Exception as e:
        summary_text = f"Summary Error: {str(e)}"

    # Rephrase the entire content in manageable chunks to respect the
    # paraphraser's input limit.
    rephrased_text = ""
    try:
        chunk_size = 500
        content_chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
        for chunk in content_chunks:
            rephrased = rephraser(chunk)
            rephrased_text += rephrased[0]['generated_text'] + " "
    except Exception as e:
        rephrased_text = f"Rephrase Error: {str(e)}"

    # Sentiment analysis on the leading text only (model input limit).
    try:
        sentiment = sentiment_analyzer(content[:512])
        sentiment_text = sentiment[0]['label']
    except Exception as e:
        sentiment_text = f"Sentiment Analysis Error: {str(e)}"

    # Extract "keywords" — currently just the first ten whitespace-separated
    # tokens; replace with a real keyword extractor if needed.
    keywords = ' '.join(content.split()[:10])

    # Save the extracted text for the download link. delete=False is
    # deliberate: the file must outlive this function so the UI can serve it.
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.txt') as temp_file:
            temp_file.write(content.encode('utf-8'))
            processed_file_path = temp_file.name
    except Exception as e:
        processed_file_path = f"Error saving processed document: {str(e)}"

    return content, rephrased_text.strip(), summary_text, sentiment_text, keywords, processed_file_path

# Define the functions for the different pages
def home_page():
    """Build the Home tab: upload a document, show original and rephrased text.

    Returns
    -------
    gr.Blocks
        The assembled Gradio Blocks layout for the home page.
    """
    with gr.Blocks() as home:
        # Header
        gr.Markdown("## Upload a Document to Process")

        # Menu bar as buttons (navigation placeholders; tabs handle switching)
        with gr.Row():
            home_btn = gr.Button("Home")
            full_analysis_btn = gr.Button("Full Analysis", variant="primary")

        # Display content on home page
        gr.Markdown("Welcome to the Document Processor!")
        gr.Markdown("Upload your document here and click to view details on the 'Full Analysis' page.")

        # File upload and content output
        file_input = gr.File(label="Upload Document")
        content_output = gr.Textbox(label="Original Content")
        rephrased_output = gr.Textbox(label="Rephrased Content")

        def _guess_file_type(file):
            # gr.File hands back an object (or path) whose name carries the
            # original extension; fall back to docx for unrecognized names.
            name = getattr(file, "name", str(file))
            ext = name.rsplit(".", 1)[-1].lower() if "." in name else ""
            return ext if ext in ("pdf", "docx", "txt", "pptx") else "docx"

        def on_file_upload(file):
            if not file:
                return "No file uploaded.", None
            # Infer the document type from the filename instead of assuming
            # every upload is a .docx.
            content, rephrased, _, _, _, _ = process_file(file, file_type=_guess_file_type(file))
            return content, rephrased

        # Process file on upload
        file_input.change(on_file_upload, inputs=file_input, outputs=[content_output, rephrased_output])

    return home

def detailed_page():
    """Build the Full Analysis tab: upload + explicit type, show keywords,
    sentiment, and a download link for the processed text.

    Returns
    -------
    gr.Blocks
        The assembled Gradio Blocks layout for the detailed-analysis page.
    """
    with gr.Blocks() as detailed:
        # Header
        gr.Markdown("## Detailed Analysis Page")

        # Menu bar as buttons (navigation placeholders; tabs handle switching)
        with gr.Row():
            home_btn = gr.Button("Home", variant="primary")
            full_analysis_btn = gr.Button("Full Analysis")

        # File upload and processing components
        file_input = gr.File(label="Upload Document")
        file_type = gr.Dropdown(["pdf", "docx", "txt", "pptx"], label="File Type")
        keywords_output = gr.Textbox(label="Keywords")
        sentiment_output = gr.Textbox(label="Sentiment Analysis")
        download_link = gr.File(label="Download Processed Document")

        def on_file_upload(file, file_type):
            if not file:
                # Exactly three output components are wired below, so the
                # no-file path must return three values (was four — a bug
                # that broke the callback on empty uploads).
                return "No file uploaded.", None, None
            _, _, _, sentiment, keywords, download_path = process_file(file, file_type)
            return keywords, sentiment, download_path

        # Process file on upload
        file_input.change(on_file_upload, inputs=[file_input, file_type], outputs=[keywords_output, sentiment_output, download_link])

        # Sample output or content for the detailed analysis page
        gr.Markdown("Here you will see detailed analysis outputs after document upload.")

    return detailed

# Main application interface with tabbed navigation.
# NOTE: both pages are constructed eagerly and the server is launched at
# module top level, so importing this file starts the app.
iface = gr.TabbedInterface([home_page(), detailed_page()], ["Home", "Full Analysis"])
iface.launch()