Spaces:

Tungdabiban
/

Multi_source_Summarizer

Sleeping

App Files Files Community

Tungdabiban committed on Feb 5

Commit

864b667

verified ·

1 Parent(s): 2be45ba

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -40

app.py CHANGED Viewed

@@ -7,10 +7,14 @@ import re
 from typing import Optional
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 import fitz  # PyMuPDF
 # ============================================================
 # Initialize FastAPI App
@@ -303,26 +307,85 @@ async def summarize_text(request: SummarizeRequest):
             detail="Văn bản quá ngắn để tóm tắt (cần ít nhất 50 ký tự)."
         )
-    # Generate summaries
-    summaries = summarize_long_text(text)
-    if not summaries:
-        raise HTTPException(
-            status_code=500,
-            detail="Không thể tạo tóm tắt."
-        )
-    # Format as bullet points với giới hạn số lượng
-    bullet_points = format_as_bullet_points(summaries, max_points=max_points)
-    return {
-        "success": True,
-        "original_length": len(text),
-        "word_count": len(text.split()),
-        "num_chunks": len(summaries),
-        "length_level": length_level,
-        "summary": bullet_points
-    }
 @app.post("/upload-pdf")
@@ -330,7 +393,7 @@ async def upload_pdf(file: UploadFile = File(...), length_level: int = 1):
     """
     Upload và tóm tắt file PDF.
     Đọc qua byte stream, KHÔNG lưu file ra đĩa.
-    Trả về danh sách Bullet Points.
     length_level: 0 = Ngắn (2-3 ý), 1 = Trung bình (4-5 ý), 2 = Chi tiết (6+ ý)
     """
     # Map length_level to max_points
@@ -365,7 +428,7 @@ async def upload_pdf(file: UploadFile = File(...), length_level: int = 1):
             detail="File quá lớn. Giới hạn 10MB."
         )
-    # Extract text from PDF bytes (dùng fitz.open(stream=contents, filetype='pdf'))
     text = extract_text_from_pdf_bytes(contents)
     # Validate extracted text
@@ -375,27 +438,11 @@ async def upload_pdf(file: UploadFile = File(...), length_level: int = 1):
             detail="Văn bản trích xuất từ PDF quá ngắn."
         )
-    # Generate summaries
-    summaries = summarize_long_text(text)
-    if not summaries:
-        raise HTTPException(
-            status_code=500,
-            detail="Không thể tạo tóm tắt."
-        )
-    # Format as bullet points với giới hạn số lượng
-    bullet_points = format_as_bullet_points(summaries, max_points=max_points)
-    return {
-        "success": True,
-        "filename": file.filename,
-        "original_length": len(text),
-        "word_count": len(text.split()),
-        "num_chunks": len(summaries),
-        "length_level": length_level,
-        "summary": bullet_points
-    }
 # ============================================================

 from typing import Optional
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 import fitz  # PyMuPDF
+import json
+import gc
+import asyncio
 # ============================================================
 # Initialize FastAPI App
             detail="Văn bản quá ngắn để tóm tắt (cần ít nhất 50 ký tự)."
         )
+    # Return StreamingResponse
+    return StreamingResponse(
+        stream_summary_generator(text, max_points),
+        media_type="application/x-ndjson"
+    )
+async def stream_summary_generator(text: str, max_points: int = None):
+    """
+    Generator function to stream summary chunks.
+    Implements: Recursive chunking, Context-aware summarization, Memory optimization.
+    YIELDS JSON: {"text": "• Point 1\n", "done": False} + "\n"
+    """
+    chunks = chunk_text_by_words(text, max_words=800)
+    total_chunks = len(chunks)
+    # Context cho chunk tiếp theo (summary của chunk trước)
+    context_summary = ""
+    bullet_count = 0
+    for i, chunk in enumerate(chunks):
+        # 1. Prepare input: Context + Current Chunk
+        # Nếu có context, nối vào đầu chunk (có phân cách)
+        if context_summary:
+            # Giới hạn context để tránh quá dài (lấy 200 ký tự cuối của summary trước)
+            short_context = context_summary[-200:] if len(context_summary) > 200 else context_summary
+            input_text = f"Tóm tắt tiếp theo ngữ cảnh: {short_context}\nNội dung: {chunk}"
+        else:
+            input_text = chunk
+        # 2. Generate Summary
+        # Chạy trong threadpool để không chặn event loop của FastAPI
+        try:
+            summary_part = await asyncio.to_thread(generate_summary, input_text)
+        except Exception as e:
+            error_json = json.dumps({"error": str(e), "done": True})
+            yield error_json + "\n"
+            return
+        # 3. Format as bullets
+        # Chỉ lấy max_points còn lại nếu có giới hạn
+        points_limit = None
+        if max_points is not None:
+            points_limit = max_points - bullet_count
+            if points_limit <= 0:
+                break # Đã đủ số ý
+        bullets_text = format_as_bullet_points([summary_part], max_points=points_limit)
+        if bullets_text:
+            # Update context for next iteration
+            context_summary = summary_part.replace('\n', ' ')
+            # Count bullets
+            new_points = bullets_text.count('•')
+            bullet_count += new_points
+            # Yield Result
+            result_json = json.dumps({
+                "text": bullets_text + "\n",
+                "done": False,
+                "progress": int((i + 1) / total_chunks * 100)
+            })
+            yield result_json + "\n"
+        # 4. Memory Optimization
+        try:
+            del input_text
+            del summary_part
+        except UnboundLocalError:
+            pass
+        gc.collect() # Force garbage collection
+        # Nhường CPU cho request khác 1 chút
+        await asyncio.sleep(0.1)
+    # Final message
+    yield json.dumps({"text": "", "done": True, "progress": 100}) + "\n"
 @app.post("/upload-pdf")
     """
     Upload và tóm tắt file PDF.
     Đọc qua byte stream, KHÔNG lưu file ra đĩa.
+    Trả về StreamingResponse (NDJSON).
     length_level: 0 = Ngắn (2-3 ý), 1 = Trung bình (4-5 ý), 2 = Chi tiết (6+ ý)
     """
     # Map length_level to max_points
             detail="File quá lớn. Giới hạn 10MB."
         )
+    # Extract text from PDF bytes
     text = extract_text_from_pdf_bytes(contents)
     # Validate extracted text
             detail="Văn bản trích xuất từ PDF quá ngắn."
         )
+    # Return StreamingResponse
+    return StreamingResponse(
+        stream_summary_generator(text, max_points),
+        media_type="application/x-ndjson"
+    )
 # ============================================================