File size: 2,540 Bytes
840cf52
7cd95b1
 
840cf52
b26f983
840cf52
7cd95b1
 
a725b8a
840cf52
7cd95b1
 
 
 
 
840cf52
f6b4c1e
7cd95b1
f6b4c1e
7cd95b1
b26f983
 
 
 
 
 
 
 
840cf52
7cd95b1
 
 
a725b8a
f6b4c1e
 
840cf52
 
b26f983
7cd95b1
 
b26f983
 
 
840cf52
a725b8a
7cd95b1
840cf52
7cd95b1
 
840cf52
f6b4c1e
 
 
a725b8a
f6b4c1e
840cf52
 
7cd95b1
 
 
 
840cf52
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import gradio as gr
import fitz
import torch
from transformers import pipeline
import time, logging, re

# Quiet library chatter; user-facing progress below goes through print().
logging.basicConfig(level=logging.ERROR)
device = -1  # CPU-only (HF pipeline convention: -1 = CPU, >= 0 = CUDA device index)
print("⚠️ CPU-only. Expect ~20–30s for 300,000 chars.")

try:
    # t5-small: smallest seq2seq summarizer; explicit float32 keeps CPU inference stable.
    summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
except Exception as e:
    print(f"❌ Model loading failed: {str(e)}")
    # `exit()` is injected by the `site` module and may be absent (python -S,
    # frozen apps); raising SystemExit directly is the portable equivalent.
    raise SystemExit(1)

def _clean_text(text):
    """Strip OCR/LaTeX noise from extracted text and keep printable ASCII only."""
    text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)  # Unwrap inline $...$ math
    text = re.sub(r"\\cap", "intersection", text)  # Spell out the \cap (∩) macro
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    return "".join(c for c in text if ord(c) < 128)  # ASCII only


def summarize_file(file_bytes):
    """Summarize uploaded notes (PDF, with plain-text fallback) chunk by chunk.

    Extracts text with PyMuPDF; if the bytes are not a valid PDF (e.g. a .txt
    upload, which the UI advertises), they are decoded as UTF-8 instead. The
    cleaned text is capped at 300,000 chars, split into 2,000-char chunks, and
    each chunk is summarized by the module-level `summarizer` pipeline under a
    20-second wall-clock budget.

    Args:
        file_bytes: Raw upload bytes from gr.File(type="binary").

    Returns:
        A markdown report (char count, elapsed time, per-chunk summaries),
        or an error string prefixed with ❌.
    """
    start = time.time()
    print(f"File type: {type(file_bytes)}")
    try:
        try:
            # Context manager closes the document promptly (the original leaked it).
            with fitz.open(stream=file_bytes, filetype="pdf") as doc:
                text = "".join(page.get_text("text") for page in doc)
        except Exception:
            # Not a PDF: the UI accepts TXT too, so fall back to decoding bytes.
            text = file_bytes.decode("utf-8", errors="ignore")
        text = _clean_text(text)
        print(f"Extracted chars: {len(text)}")
    except Exception as e:
        return f"❌ Text extraction failed: {str(e)}"
    if not text.strip(): return "❌ No text found"
    text = text[:300000]  # Hard cap to bound runtime
    chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]
    print(f"Chunks created: {len(chunks)}")
    if not chunks: return "❌ No chunks to summarize"
    summaries = []
    for i, chunk in enumerate(chunks):
        # Wall-clock budget: stop summarizing once we pass 20 s.
        if time.time() - start > 20: 
            summaries.append("⚠️ Stopped early")
            break
        # Chunks dominated by non-alphanumerics are likely equations; t5 mangles them.
        if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
            summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
            continue
        try:
            summary = summarizer(chunk, max_length=60, min_length=10, do_sample=False)[0]['summary_text']
            summaries.append(f"**Chunk {i+1}**:\n{summary}")
        except Exception as e:
            summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
    return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries)

# Wire the summarizer into a minimal Gradio UI: binary upload in, text report out.
_notes_input = gr.File(label="πŸ“„ PDF/TXT Notes", type="binary")
_summary_output = gr.Textbox(label="πŸ“ Summary")
demo = gr.Interface(
    fn=summarize_file,
    inputs=_notes_input,
    outputs=_summary_output,
    title="Fast Summarizer",
    description="300,000+ chars in ~20–30s (CPU)",
)

if __name__ == "__main__":
    # Local-only server; surface launch failures (e.g. port in use) instead of a traceback.
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as e:
        print(f"❌ Gradio launch failed: {str(e)}")