Spaces:
Sleeping
Sleeping
File size: 5,367 Bytes
7b9d5b1 47a8c64 7b9d5b1 47a8c64 62801eb 7b9d5b1 47a8c64 7b9d5b1 47a8c64 7b9d5b1 62801eb 47a8c64 7b9d5b1 47a8c64 62801eb 47a8c64 7b9d5b1 47a8c64 1ae4c5e 47a8c64 7b9d5b1 47a8c64 7b9d5b1 47a8c64 62801eb 7b9d5b1 47a8c64 62801eb 47a8c64 d65c22a 62801eb d65c22a 62801eb 7b9d5b1 62801eb 47a8c64 62801eb 47a8c64 62801eb 47a8c64 7b9d5b1 47a8c64 7b9d5b1 15f90ff d65c22a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import logging
import mimetypes
import os
import re
import tempfile
import time
from io import BytesIO

import docx
import fitz
import gradio as gr
import openpyxl
import pandas as pd
import pytesseract
import textract
import torch
from PIL import Image
from striprtf.striprtf import rtf_to_text
from transformers import pipeline
# Silence non-error log chatter from transformers/gradio.
logging.basicConfig(level=logging.ERROR)
# transformers convention: device=-1 runs inference on CPU, >=0 selects a CUDA device.
device = -1 # CPU-only
# NOTE(review): the emoji in the strings below appear mojibake'd ("β οΈ", "β")
# from a bad encoding round-trip — confirm against the original file.
print("β οΈ CPU-only. Expect ~5β9s for 300,000 chars.")
try:
# t5-small keeps the footprint small enough for CPU-only inference;
# torch_dtype=torch.float32 is explicit — presumably to avoid fp16 on CPU (confirm).
summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
except Exception as e:
# The app cannot function without the model, so fail fast at startup.
print(f"β Model loading failed: {str(e)}")
exit(1)
def summarize_file_bytes(file_bytes, filename):
    """Extract text from one file's raw bytes and summarize it chunk by chunk.

    Args:
        file_bytes: Raw file content as ``bytes``.
        filename: Original file name; used only to guess the MIME type
            (and the temp-file suffix for the textract fallback).

    Returns:
        A ``(summary_markdown, raw_text)`` tuple. On success both elements
        hold the same formatted summary; on failure the first element is an
        error message and the second is ``""``.
    """
    # NOTE(review): the literal "(unknown)" inside the f-strings below looks
    # like a garbled {filename} placeholder from extraction — confirm against
    # the original file. Strings are preserved byte-for-byte here.
    start = time.time()
    try:
        if not isinstance(file_bytes, bytes) or len(file_bytes) == 0:
            return f"β (unknown): Invalid or empty file", ""
        # BUG FIX: mimetypes.guess_type() always returns a (type, encoding)
        # tuple, which is truthy, so the old `... or ('text/plain', None)`
        # fallback could never fire. Fall back on the unpacked value instead.
        mime, _ = mimetypes.guess_type(filename)
        mime = mime or 'text/plain'
        text = ""
        if mime == 'application/pdf':
            doc = fitz.open(stream=file_bytes, filetype="pdf")
            text = "".join(page.get_text("text") for page in doc)
        elif mime in ['text/plain', 'text/rtf']:
            text = rtf_to_text(file_bytes.decode("utf-8", errors="ignore")) if mime == 'text/rtf' else file_bytes.decode("utf-8", errors="ignore")
        elif mime in ['text/csv', 'application/vnd.ms-excel']:
            text = " ".join(pd.read_csv(BytesIO(file_bytes)).astype(str).values.flatten())
        elif mime == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            doc = docx.Document(BytesIO(file_bytes))
            text = " ".join(p.text for p in doc.paragraphs if p.text)
        elif mime in ['image/jpeg', 'image/png']:
            # BUG FIX: the original one-liner referenced `img` inside its own
            # defining expression, raising NameError on every image upload.
            # Open first, then downscale to 300px height (grayscale) for OCR.
            img = Image.open(BytesIO(file_bytes)).convert('L')
            img = img.resize((int(img.width * 300 / img.height), 300))
            text = pytesseract.image_to_string(img)
        elif mime == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
            df = pd.read_excel(BytesIO(file_bytes), engine='openpyxl')
            text = " ".join(df.astype(str).values.flatten())
        else:
            # BUG FIX: textract.process() takes a file *path*, not bytes.
            # Spill the bytes to a temp file (keeping the extension so
            # textract can pick a parser), then clean up.
            suffix = "." + filename.rsplit(".", 1)[-1] if "." in filename else ""
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp.write(file_bytes)
                tmp_path = tmp.name
            try:
                text = textract.process(tmp_path).decode("utf-8", errors="ignore")
            finally:
                os.remove(tmp_path)
        text = re.sub(r"[^\x20-\x7E]", "", text)  # Printable ASCII only
        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)  # drop inline $...$ math delimiters
        text = re.sub(r"\\cap", "intersection", text)  # spell out LaTeX \cap
        text = re.sub(r"\s+", " ", text).strip()
        # Reject near-empty or symbol-dominated extractions before summarizing.
        if not text or len(text) < 100 or sum(1 for c in text if c.isalnum()) < 50:
            return f"β (unknown): Invalid or too short text", ""
        print(f"Extracted chars for (unknown): {len(text)}")
    except Exception as e:
        return f"β (unknown): Text extraction failed: {str(e)}", ""
    text = text[:300000]  # hard cap keeps CPU latency bounded
    chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
    print(f"Chunks for (unknown): {len(chunks)}")
    if not chunks:
        return f"β (unknown): No chunks to summarize", ""
    # Sample at most 12 chunks spread evenly across the document.
    selected_indices = [int(i * len(chunks) / 12) for i in range(12)] if len(chunks) >= 12 else list(range(len(chunks)))
    summaries = []
    for i in selected_indices:
        chunk = chunks[i]
        # Skip chunks dominated (>70%) by non-alphanumerics, e.g. equations.
        if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.7:
            summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
            continue
        try:
            summary = summarizer(chunk, max_length=40, min_length=10, do_sample=False)[0]['summary_text']
            summaries.append(f"**Chunk {i+1}**:\n{summary}")
        except Exception as e:
            summaries.append(f"**Chunk {i+1}**: β Error: {str(e)}")
    # Pad so the output always shows exactly 12 chunk entries.
    while len(summaries) < 12:
        summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content")
    summary_text = f"π **(unknown)**\n**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries[:12])
    return summary_text, summary_text
def summarize_multiple_files(*file_objs):
    """Summarize every uploaded file and write a combined .txt transcript.

    Args:
        *file_objs: Uploads from ``gr.File``. Each item may be a file-like
            object (with ``read()`` and ``name``) or — with
            ``gr.File(type="binary")`` — the raw ``bytes`` of an upload.

    Returns:
        A ``(summaries_markdown, txt_path)`` tuple; ``txt_path`` is ``None``
        when nothing was uploaded.
    """
    if not file_objs or not any(file_objs):
        return "β No files uploaded", None
    all_summaries = []
    combined_text = ""
    # Gradio may hand us one list of uploads or the uploads as varargs.
    files = file_objs[0] if isinstance(file_objs[0], list) else file_objs
    for file in files:
        if isinstance(file, (bytes, bytearray)):
            # BUG FIX: gr.File(type="binary") delivers raw bytes, which the
            # old hasattr('read') guard rejected outright. Accept them here;
            # no filename is available in this form.
            file_bytes = bytes(file)
            filename = "(unknown)"
        elif hasattr(file, 'read') and hasattr(file, 'name'):
            # os.path.basename is the idiomatic way to strip the directory
            # (the old split("/") assumed POSIX separators).
            filename = os.path.basename(file.name)
            file_bytes = file.read()
        else:
            all_summaries.append(f"β Invalid file: Missing read() or name")
            continue
        summary, raw_text = summarize_file_bytes(file_bytes, filename)
        all_summaries.append(summary)
        combined_text += f"\n\n{raw_text}\n" + "="*60 + "\n"
    # delete=False: Gradio needs the file to outlive this call for download.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as f:
        f.write(combined_text)
        summary_file_path = f.name
    return "\n\n".join(all_summaries), summary_file_path
# Wire the summarizer into a one-function Gradio UI.
# NOTE(review): the description promises "exactly 15 lines" but
# summarize_file_bytes emits 12 chunk summaries — confirm which is intended.
# NOTE(review): label/title emoji ("π", "π₯") appear mojibake'd — confirm
# against the original file before editing these strings.
demo = gr.Interface(
fn=summarize_multiple_files,
inputs=gr.File(label="π Upload Any File", type="binary", file_count="multiple"),
outputs=[
gr.Textbox(label="π Summary", lines=15, max_lines=100),
gr.File(label="π₯ Download Summary as .txt")
],
title="π Multi-File Summarizer",
description="Summarizes any file into exactly 15 lines. Download as .txt. ~5β9s for 300,000 chars (CPU)."
)
# Script entry point: serve the UI locally on port 7860 without a share link.
if __name__ == "__main__":
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as exc:
        # Surface startup failures (e.g. port already in use) instead of a bare traceback.
        print(f"β Gradio launch failed: {str(exc)}")
|