Spaces:
Sleeping
Sleeping
File size: 2,540 Bytes
840cf52 7cd95b1 840cf52 b26f983 840cf52 7cd95b1 a725b8a 840cf52 7cd95b1 840cf52 f6b4c1e 7cd95b1 f6b4c1e 7cd95b1 b26f983 840cf52 7cd95b1 a725b8a f6b4c1e 840cf52 b26f983 7cd95b1 b26f983 840cf52 a725b8a 7cd95b1 840cf52 7cd95b1 840cf52 f6b4c1e a725b8a f6b4c1e 840cf52 7cd95b1 840cf52 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import gradio as gr
import fitz
import torch
from transformers import pipeline
import time, logging, re
# Quiet library logging; only hard errors reach the console.
logging.basicConfig(level=logging.ERROR)
device = -1 # CPU-only
print("β οΈ CPU-only. Expect ~20β30s for 300,000 chars.")
try:
    # Load the summarization model once at startup. t5-small is chosen to keep
    # CPU inference latency tolerable; float32 is the safe CPU dtype.
    summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
except Exception as e:
    # Model download/load can fail (network, disk); the app is useless without
    # it, so report and stop.
    print(f"β Model loading failed: {str(e)}")
    exit(1)
def _extract_text(file_bytes):
    """Extract ASCII text from uploaded bytes: parse as PDF, else decode as UTF-8.

    The UI advertises PDF/TXT uploads; if PyMuPDF rejects the bytes we fall
    back to treating them as plain text. Cleans OCR/LaTeX noise and normalizes
    whitespace before returning.
    """
    try:
        doc = fitz.open(stream=file_bytes, filetype="pdf")
        try:
            text = "".join(page.get_text("text") for page in doc)
        finally:
            doc.close()  # fix: document was previously leaked per request
    except Exception:
        # Not a parseable PDF — assume a plain-text file.
        text = file_bytes.decode("utf-8", errors="ignore")
    text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)  # Strip $...$ LaTeX delimiters
    text = re.sub(r"\\cap", "intersection", text)  # Handle β©
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    return "".join(c for c in text if ord(c) < 128)  # ASCII only


def summarize_file(file_bytes):
    """Summarize an uploaded PDF or text file chunk by chunk.

    Args:
        file_bytes: Raw upload contents (Gradio File with type="binary").

    Returns:
        Markdown string with per-chunk summaries plus char/time stats, or an
        error message on extraction failure / empty input.
    """
    start = time.time()
    print(f"File type: {type(file_bytes)}")
    try:
        text = _extract_text(file_bytes)
        print(f"Extracted chars: {len(text)}")
    except Exception as e:
        return f"β Text extraction failed: {str(e)}"
    if not text.strip():
        return "β No text found"
    text = text[:300000]  # Cap input so CPU runtime stays bounded
    chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]
    print(f"Chunks created: {len(chunks)}")
    if not chunks:
        return "β No chunks to summarize"
    summaries = []
    for i, chunk in enumerate(chunks):
        if time.time() - start > 20:  # hard wall-clock budget
            summaries.append("β οΈ Stopped early")
            break
        # Skip symbol-heavy chunks (equations): the model produces noise on
        # them and they waste the time budget. len(chunk) >= 1 by construction.
        if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
            summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
            continue
        try:
            summary = summarizer(chunk, max_length=60, min_length=10, do_sample=False)[0]['summary_text']
            summaries.append(f"**Chunk {i+1}**:\n{summary}")
        except Exception as e:
            # One bad chunk must not abort the rest of the document.
            summaries.append(f"**Chunk {i+1}**: β Error: {str(e)}")
    return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries)
# Gradio UI: binary file upload in, markdown-ish summary text out.
demo = gr.Interface(
    fn=summarize_file,
    title="Fast Summarizer",
    description="300,000+ chars in ~20β30s (CPU)",
    inputs=gr.File(label="π PDF/TXT Notes", type="binary"),
    outputs=gr.Textbox(label="π Summary"),
)
if __name__ == "__main__":
    # Local-only launch on the default Spaces port; report startup failures
    # rather than letting the traceback escape.
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as err:
        print(f"β Gradio launch failed: {str(err)}")
|