File size: 5,938 Bytes
ddb93bd
 
 
 
 
 
 
 
 
 
840069e
ddb93bd
 
 
 
 
 
781ba9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddb93bd
 
 
 
 
840069e
ddb93bd
781ba9f
ddb93bd
840069e
ddb93bd
 
 
 
 
 
 
 
 
 
840069e
ddb93bd
 
 
 
 
 
 
 
 
840069e
ddb93bd
 
 
 
 
840069e
ddb93bd
 
 
 
 
 
 
 
 
 
 
840069e
 
 
 
781ba9f
ddb93bd
840069e
 
 
 
 
 
 
 
781ba9f
 
 
 
 
 
 
 
 
 
 
 
 
 
840069e
 
2063d0f
840069e
 
 
 
 
 
 
 
 
 
3cd943f
840069e
 
 
781ba9f
840069e
 
3cd943f
 
781ba9f
 
 
3cd943f
 
781ba9f
3cd943f
840069e
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import gradio as gr
import docx
import PyPDF2
from pptx import Presentation
from transformers import pipeline
from docx import Document
from io import BytesIO
import tempfile

# Initialize Hugging Face models for summarization, rephrasing, and sentiment analysis.
# NOTE: each pipeline() call loads (and on first run downloads) model weights at
# module import time, so startup can be slow and memory-hungry.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# max_length/truncation here bound the paraphraser's generated output per chunk.
rephraser = pipeline("text2text-generation", model="Vamsi/T5_Paraphrase_Paws", max_length=512, truncation=True)
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Function to read content from different file types
def read_file(file, file_type):
    """Extract plain text from an uploaded document.

    Parameters
    ----------
    file : file-like object (or path accepted by the underlying parser)
        The uploaded document.
    file_type : str
        One of "docx", "txt", "pdf", "pptx". Any other value returns "".

    Returns
    -------
    str
        The extracted text, or an ``"Error reading the file: ..."`` message
        if parsing raised.
    """
    content = ""
    try:
        if file_type == "docx":
            doc = Document(file)
            for para in doc.paragraphs:
                content += para.text + "\n"
        elif file_type == "txt":
            data = file.read()
            # Accept both binary and text-mode streams: only bytes need decoding.
            content = data.decode("utf-8") if isinstance(data, bytes) else data
        elif file_type == "pdf":
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                # extract_text() may return None (e.g. image-only pages);
                # guard so we don't raise TypeError on the concatenation.
                content += (page.extract_text() or "") + "\n"
        elif file_type == "pptx":
            prs = Presentation(file)
            for slide in prs.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        content += shape.text + "\n"
    except Exception as e:
        content = f"Error reading the file: {str(e)}"

    return content

# Function to process the file and generate outputs
def process_file(file, file_type, language="en"):
    """Run the full analysis pipeline on an uploaded document.

    Parameters
    ----------
    file : file-like object
        The uploaded document, forwarded to :func:`read_file`.
    file_type : str
        One of "pdf", "docx", "txt", "pptx".
    language : str, optional
        Reserved for future localization; currently unused.

    Returns
    -------
    tuple
        ``(content, rephrased_text, summary_text, sentiment_text,
        keywords, processed_file_path)``. On an empty/unreadable document
        the first element is an error string and the rest are ``None``.
    """
    content = read_file(file, file_type)

    # Bail out on empty documents or reader failures. Match the exact error
    # prefix produced by read_file instead of scanning for the word "Error",
    # which could legitimately occur inside a valid document.
    if not content.strip() or content.startswith("Error reading the file:"):
        return "Error: The document is empty or unsupported format.", None, None, None, None, None

    # Summarize the content. BART has a fixed input length, so let the
    # pipeline truncate rather than erroring out on long documents.
    try:
        summary = summarizer(content, max_length=150, min_length=50, do_sample=False, truncation=True)
        summary_text = summary[0]['summary_text']
    except Exception as e:
        summary_text = f"Summary Error: {str(e)}"

    # Rephrase the entire content in manageable chunks to respect the
    # paraphraser's input limit.
    rephrased_text = ""
    try:
        chunk_size = 500
        content_chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
        for chunk in content_chunks:
            rephrased = rephraser(chunk)
            rephrased_text += rephrased[0]['generated_text'] + " "
    except Exception as e:
        rephrased_text = f"Rephrase Error: {str(e)}"

    # Sentiment analysis on the leading text only (model input limit).
    try:
        sentiment = sentiment_analyzer(content[:512])
        sentiment_text = sentiment[0]['label']
    except Exception as e:
        sentiment_text = f"Sentiment Analysis Error: {str(e)}"

    # Extract "keywords" — currently just the first ten whitespace-separated
    # tokens; replace with a real keyword extractor if needed.
    keywords = ' '.join(content.split()[:10])

    # Save the extracted text for the download link. delete=False is
    # deliberate: the file must outlive this function so the UI can serve it.
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.txt') as temp_file:
            temp_file.write(content.encode('utf-8'))
            processed_file_path = temp_file.name
    except Exception as e:
        processed_file_path = f"Error saving processed document: {str(e)}"

    return content, rephrased_text.strip(), summary_text, sentiment_text, keywords, processed_file_path

# Define the functions for the different pages
def home_page():
    """Build the Home tab: upload a document, show original and rephrased text.

    Returns
    -------
    gr.Blocks
        The assembled Gradio Blocks layout for the home page.
    """
    with gr.Blocks() as home:
        # Header
        gr.Markdown("## Upload a Document to Process")

        # Menu bar as buttons (navigation placeholders; tabs handle switching)
        with gr.Row():
            home_btn = gr.Button("Home")
            full_analysis_btn = gr.Button("Full Analysis", variant="primary")

        # Display content on home page
        gr.Markdown("Welcome to the Document Processor!")
        gr.Markdown("Upload your document here and click to view details on the 'Full Analysis' page.")

        # File upload and content output
        file_input = gr.File(label="Upload Document")
        content_output = gr.Textbox(label="Original Content")
        rephrased_output = gr.Textbox(label="Rephrased Content")

        def _guess_file_type(file):
            # gr.File hands back an object (or path) whose name carries the
            # original extension; fall back to docx for unrecognized names.
            name = getattr(file, "name", str(file))
            ext = name.rsplit(".", 1)[-1].lower() if "." in name else ""
            return ext if ext in ("pdf", "docx", "txt", "pptx") else "docx"

        def on_file_upload(file):
            if not file:
                return "No file uploaded.", None
            # Infer the document type from the filename instead of assuming
            # every upload is a .docx.
            content, rephrased, _, _, _, _ = process_file(file, file_type=_guess_file_type(file))
            return content, rephrased

        # Process file on upload
        file_input.change(on_file_upload, inputs=file_input, outputs=[content_output, rephrased_output])

    return home

def detailed_page():
    """Build the Full Analysis tab: upload + explicit type, show keywords,
    sentiment, and a download link for the processed text.

    Returns
    -------
    gr.Blocks
        The assembled Gradio Blocks layout for the detailed-analysis page.
    """
    with gr.Blocks() as detailed:
        # Header
        gr.Markdown("## Detailed Analysis Page")

        # Menu bar as buttons (navigation placeholders; tabs handle switching)
        with gr.Row():
            home_btn = gr.Button("Home", variant="primary")
            full_analysis_btn = gr.Button("Full Analysis")

        # File upload and processing components
        file_input = gr.File(label="Upload Document")
        file_type = gr.Dropdown(["pdf", "docx", "txt", "pptx"], label="File Type")
        keywords_output = gr.Textbox(label="Keywords")
        sentiment_output = gr.Textbox(label="Sentiment Analysis")
        download_link = gr.File(label="Download Processed Document")

        def on_file_upload(file, file_type):
            if not file:
                # Exactly three output components are wired below, so the
                # no-file path must return three values (was four — a bug
                # that broke the callback on empty uploads).
                return "No file uploaded.", None, None
            _, _, _, sentiment, keywords, download_path = process_file(file, file_type)
            return keywords, sentiment, download_path

        # Process file on upload
        file_input.change(on_file_upload, inputs=[file_input, file_type], outputs=[keywords_output, sentiment_output, download_link])

        # Sample output or content for the detailed analysis page
        gr.Markdown("Here you will see detailed analysis outputs after document upload.")

    return detailed

# Main application interface with tabbed navigation.
# NOTE: both pages are constructed eagerly and the server is launched at
# module top level, so importing this file starts the app.
iface = gr.TabbedInterface([home_page(), detailed_page()], ["Home", "Full Analysis"])
iface.launch()