File size: 5,711 Bytes
f9c46c3
2e40204
f9c46c3
 
9b9ef33
f9c46c3
2e40204
f9c46c3
 
 
 
ef48701
 
18c46de
f9c46c3
18c46de
f9c46c3
18c46de
f9c46c3
2e40204
f9c46c3
 
18c46de
2e40204
40cfd28
2e40204
 
 
 
f9c46c3
 
 
 
 
 
ef48701
 
f9c46c3
ef48701
 
f9c46c3
 
ef48701
f9c46c3
2e40204
f9c46c3
 
 
 
 
 
 
 
ef48701
 
 
 
 
f9c46c3
ef48701
f9c46c3
 
ef48701
f9c46c3
9b9ef33
 
ef48701
 
 
 
 
 
 
40cfd28
ef48701
 
 
9b9ef33
40cfd28
ef48701
 
40cfd28
ef48701
f9c46c3
ef48701
 
f9c46c3
40cfd28
 
 
 
 
 
ef48701
 
f9c46c3
9b9ef33
18c46de
 
9b9ef33
18c46de
9b9ef33
 
 
 
 
18c46de
 
 
 
2e40204
 
 
 
 
 
9b9ef33
2e40204
 
 
 
 
 
 
18c46de
 
2e40204
 
 
 
40cfd28
2e40204
f9c46c3
ef48701
f9c46c3
40cfd28
ef48701
f9c46c3
2e40204
18c46de
f9c46c3
 
40cfd28
2e40204
 
 
 
 
 
 
 
40cfd28
2e40204
 
40cfd28
f9c46c3
 
2e40204
 
 
 
f9c46c3
2e40204
9b9ef33
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import gradio as gr
import fitz  # PyMuPDF
import torch
from transformers import pipeline
import time, logging, re, difflib
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import io
from PIL import Image

# Global runtime configuration: suppress sub-ERROR log noise, force CPU inference.
logging.basicConfig(level=logging.ERROR)
device = -1  # transformers convention: -1 selects CPU
print("⚠️ CPU-only. Using faster models!")

# Load the distilled (faster) summarization model once at import time.
try:
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device, torch_dtype=torch.float32)
except Exception as e:
    print(f"❌ Summarizer model loading failed: {str(e)}")
    # `exit()` is an interactive helper injected by the `site` module and may
    # be absent under `python -S` or in frozen builds; raise SystemExit instead.
    raise SystemExit(1)

# Load the extractive question-answering model once at import time.
try:
    qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", device=device)
except Exception as e:
    print(f"❌ QA model loading failed: {str(e)}")
    raise SystemExit(1)

def visualize_chunk_status(chunk_data):
    """Render a horizontal bar chart of per-chunk processing time, colored by status.

    chunk_data: list of dicts with keys 'chunk' (1-based index), 'status'
    ('summarized' / 'skipped' / 'error'), and optionally 'time' (seconds;
    defaults to 0.1 so zero-time bars stay visible).
    Returns the rendered chart as a PIL Image.
    """
    palette = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
    labels, colors, durations = [], [], []
    for entry in chunk_data:
        labels.append(f"C{entry['chunk']}")
        colors.append(palette.get(entry['status'], 'gray'))
        durations.append(entry.get('time', 0.1))

    fig, axes = plt.subplots(figsize=(10, 2.5))
    axes.barh(labels, durations, color=colors)
    axes.set_xlabel("Time (s)")
    axes.set_title("πŸ“Š Chunk Processing Status")
    plt.tight_layout()

    # Serialize the figure to an in-memory PNG, then free the figure.
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')
    png_buffer.seek(0)
    plt.close(fig)
    return Image.open(png_buffer)

def summarize_file(file_bytes):
    """Summarize a PDF supplied as raw bytes.

    Splits the cleaned text into 1000-character chunks, summarizes each with
    the global `summarizer` pipeline, and renders a per-chunk status chart.

    Returns:
        (summary_markdown, PIL.Image) on success, or (error_message, None)
        when extraction fails or the PDF contains no text.
    """
    start = time.time()
    chunk_info = []  # per-chunk {'chunk', 'status', 'time'} records for the chart

    try:
        # Context manager closes the PyMuPDF document; the original leaked the handle.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text("text") for page in doc)
        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)  # unwrap inline $...$ math
        text = re.sub(r"\\cap", "intersection", text)  # spell out the LaTeX \cap symbol
        text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace runs
        text = "".join(c for c in text if ord(c) < 128)  # keep ASCII only for the tokenizer
    except Exception as e:
        return f"❌ Text extraction failed: {str(e)}", None

    if not text.strip():
        return "❌ No text found", None

    text = text[:300000]  # allow full but reasonable size
    chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
    summaries = []

    for i, chunk in enumerate(chunks):
        chunk_start = time.time()
        chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}

        # Chunks dominated by non-alphanumerics are likely equations/tables;
        # summarizing them wastes time and yields junk, so skip them.
        if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
            summaries.append(f"### Chunk {i+1}: Skipped (equation-heavy)")
            chunk_result['status'] = 'skipped'
        else:
            try:
                summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
                summaries.append(f"### Chunk {i+1}\n{summary}")
                chunk_result['status'] = 'summarized'
            except Exception as e:
                summaries.append(f"### Chunk {i+1}: ❌ Error: {str(e)}")
                chunk_result['status'] = 'error'

        chunk_result['time'] = time.time() - chunk_start
        chunk_info.append(chunk_result)

    formatted_chunks = "\n\n---\n\n".join(summaries)
    final_summary = f"""**Characters Processed**: {len(text)}
**Total Time**: {time.time()-start:.2f} seconds
## πŸ”Ή Summary by Chunks
{formatted_chunks}
"""
    image = visualize_chunk_status(chunk_info)
    return final_summary, image

def find_relevant_passages(text, question, num_passages=5):
    """Select up to *num_passages* sentences from *text* most relevant to *question*.

    Scoring: number of shared lowercase whitespace tokens; when there is no
    overlap, fall back to a difflib similarity ratio so every passage still
    gets a comparable (if weak) score.

    Returns the selected passages joined by single spaces. Ties keep the
    original document order (stable sort).
    """
    passages = re.split(r'(?<=[.?!])\s+', text)
    question_tokens = set(question.lower().split())
    scored = []
    for passage in passages:
        if not passage:
            continue  # re.split can yield empty strings; they carry no signal
        overlap = len(question_tokens.intersection(set(passage.lower().split())))
        # Fuzzy fallback so zero-overlap passages can still be ranked.
        score = overlap if overlap else difflib.SequenceMatcher(None, question, passage).ratio()
        scored.append((score, passage))
    # Sort by score only: plain tuple comparison would tie-break by comparing
    # the passage strings themselves, giving an arbitrary reverse-alphabetical
    # order among equally-scored passages.
    scored.sort(key=lambda item: item[0], reverse=True)
    return " ".join(passage for _, passage in scored[:num_passages])

def answer_question(file_bytes, question):
    """Answer a natural-language question from a PDF supplied as raw bytes.

    Extracts and normalizes the document text, narrows it to the passages
    most relevant to the question, then runs the global `qa_pipeline`.

    Returns a markdown string with the answer and confidence score, or an
    error/warning message on failure.
    """
    try:
        # Context manager closes the PyMuPDF document; the original leaked the handle.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text("text") for page in doc)
        text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace runs
        text = "".join(c for c in text if ord(c) < 128)  # keep ASCII only
        context = text[:300000]  # cap context size
    except Exception as e:
        return f"❌ Text extraction failed: {str(e)}"

    if not question.strip():
        return "⚠️ Please enter a valid question."

    try:
        # Pre-filter to the most question-relevant sentences — the QA model
        # has a limited input window.
        relevant_context = find_relevant_passages(context, question)
        result = qa_pipeline(question=question, context=relevant_context)
        return f"**Answer**: {result['answer']}\n\n**Score**: {result['score']:.2f}"
    except Exception as e:
        return f"❌ QA failed: {str(e)}"

# Summarizer UI: one PDF upload in -> markdown summary + per-chunk status chart out.
summarizer_ui = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="πŸ“„ Upload PDF", type="binary"),  # type="binary": fn receives raw bytes
    outputs=[
        gr.Textbox(label="πŸ“ Summarized Output", lines=30, show_copy_button=True),
        gr.Image(label="πŸ“Š Visual Process Flow", type="pil")  # matches the PIL.Image return value
    ],
    title="πŸ“ AI-Powered PDF Summarizer",
    description="Summarizes long PDFs and visualizes chunk-level processing."
)

# Q&A UI: PDF upload + free-text question in -> answer string out.
qa_ui = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.File(label="πŸ“„ Upload PDF", type="binary"),  # raw bytes, same as above
        gr.Textbox(label="❓ Ask a Question")
    ],
    outputs=gr.Textbox(label="πŸ” Answer"),
    title="πŸ“š PDF Q&A Assistant",
    description="Ask natural language questions from the uploaded PDF."
)

# Entry point: expose both interfaces as tabs of a single app.
if __name__ == "__main__":
    try:
        tab_labels = ["πŸ“ Summarizer", "❓ Q&A Assistant"]
        app = gr.TabbedInterface([summarizer_ui, qa_ui], tab_labels)
        app.launch(server_port=7860)
    except Exception as e:
        # Report launch/construction failures instead of letting the traceback escape.
        print(f"❌ Gradio launch failed: {str(e)}")