Spaces:

DRIVER-DETECT
/

DRIVER_DETECT

Sleeping

App Files Files Community

Wewoo committed on Dec 5, 2025

Commit

a77c81d

verified ·

1 Parent(s): ed2fe46

Upload 2 files

Browse files

Files changed (2) hide show

app.py +92 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import gradio as gr
+from transformers import pipeline
+import PyPDF2
+# Summarization model picked by the author as the best option for English
+# that can still run on the Hugging Face free tier.
+# Built once at module level so every request reuses the same pipeline.
+summarizer = pipeline(
+    "summarization",
+    model="google/pegasus-xsum"
+)
+def read_pdf(file_obj):
+    """Đọc PDF từ Gradio file object."""
+    try:
+        with open(file_obj.name, "rb") as f:
+            reader = PyPDF2.PdfReader(f)
+            text = ""
+            for page in reader.pages:
+                t = page.extract_text()
+                if t:
+                    text += t + "\n"
+        return text
+    except Exception as e:
+        return None
+def chunk_text(text, max_chars=512):
+    """
+    Chia text thành đoạn nhỏ an toàn cho Pegasus.
+    Pegasus chịu tối đa ~512 tokens → nên dùng 512 chars để chắc chắn không crash.
+    """
+    chunks = []
+    text = text.strip()
+    while len(text) > max_chars:
+        # tìm dấu chấm gần nhất để chia tự nhiên
+        cut = text.rfind('.', 0, max_chars)
+        if cut == -1:
+            # không có dấu chấm → cắt cứng
+            cut = max_chars
+        chunk = text[:cut].strip()
+        if len(chunk) > 0:
+            chunks.append(chunk)
+        text = text[cut:].strip()
+    if len(text) > 0:
+        chunks.append(text)
+    return chunks
+def summarize_pdf(pdf_file):
+    if pdf_file is None:
+        return "Hãy upload một file PDF."
+    text = read_pdf(pdf_file)
+    if text is None:
+        return "Lỗi đọc PDF — có thể file bị mã hóa hoặc không phải PDF chuẩn."
+    text = text.strip()
+    if len(text) < 50:
+        return "PDF quá ngắn hoặc không có nội dung văn bản."
+    # chia nhỏ text → tránh lỗi CPU + tránh timeout HF
+    chunks = chunk_text(text)
+    summaries = []
+    for i, chunk in enumerate(chunks):
+        try:
+            result = summarizer(
+                chunk,
+                max_length=120,  # phù hợp Pegasus
+                min_length=20,
+                do_sample=False
+            )
+            summaries.append(result[0]["summary_text"])
+        except Exception as e:
+            summaries.append(f"[Lỗi khi tóm tắt đoạn {i+1}]: {e}")
+    # nối summaries lại
+    return "\n\n".join(summaries)
+# Gradio UI: a PDF upload, a result text box, and a button that triggers the summary.
+with gr.Blocks() as demo:
+    gr.Markdown("## 📝 Tóm tắt PDF bằng AI (Pegasus + Gradio)")
+    pdf = gr.File(label="Upload PDF", file_types=[".pdf"])
+    output = gr.Textbox(lines=12, label="Kết quả tóm tắt")
+    btn = gr.Button("Tóm tắt PDF")
+    # Clicking the button runs summarize_pdf on the upload and shows its return value in the text box.
+    btn.click(fn=summarize_pdf, inputs=pdf, outputs=output)
+# Launched unconditionally at module level (there is no __main__ guard).
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+transformers
+torch
+gradio
+PyPDF2
+sentencepiece
+accelerate