Spaces:

Tungdabiban
/

Multi_source_Summarizer

Sleeping

App Files Files Community

Tungdabiban committed 6 days ago

Commit

ff3a585

verified ·

1 Parent(s): 3f0c00b

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -109

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ Model: vinai/bartpho-syllable-base
 """
 import re
-import io
 from typing import Optional
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
@@ -13,13 +12,6 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 import fitz  # PyMuPDF
-# Optional: newspaper3k for URL extraction
-try:
-    from newspaper import Article
-    NEWSPAPER_AVAILABLE = True
-except ImportError:
-    NEWSPAPER_AVAILABLE = False
 # ============================================================
 # Initialize FastAPI App
 # ============================================================
@@ -29,7 +21,7 @@ app = FastAPI(
     version="1.0.0"
 )
-# CORS middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -53,15 +45,14 @@ print("Model loaded successfully!")
 # Request Models
 # ============================================================
 class SummarizeRequest(BaseModel):
-    text: Optional[str] = None
-    url: Optional[str] = None
 # ============================================================
 # Helper Functions
 # ============================================================
-def chunk_text_by_words(text: str, max_words: int = 700) -> list[str]:
     """
     Chia văn bản thành các đoạn tối đa max_words từ.
     Giữ nguyên câu hoàn chỉnh khi có thể.
@@ -82,23 +73,19 @@ def chunk_text_by_words(text: str, max_words: int = 700) -> list[str]:
         # Nếu câu đơn lẻ dài hơn max_words, chia nhỏ câu đó
         if sentence_word_count > max_words:
-            # Lưu chunk hiện tại trước
             if current_chunk:
                 chunks.append(' '.join(current_chunk))
                 current_chunk = []
                 current_word_count = 0
-            # Chia câu dài thành các phần
             for i in range(0, sentence_word_count, max_words):
                 chunk_words = sentence_words[i:i + max_words]
                 chunks.append(' '.join(chunk_words))
         # Nếu thêm câu này vượt quá giới hạn
         elif current_word_count + sentence_word_count > max_words:
-            # Lưu chunk hiện tại
             if current_chunk:
                 chunks.append(' '.join(current_chunk))
-            # Bắt đầu chunk mới với câu này
             current_chunk = [sentence]
             current_word_count = sentence_word_count
@@ -113,46 +100,39 @@ def chunk_text_by_words(text: str, max_words: int = 700) -> list[str]:
     return chunks
-def fix_truncated_sentence(text: str) -> str:
     """
-    Xử lý câu bị cụt ở cuối.
-    - Nếu câu cuối không kết thúc bằng dấu câu, thêm dấu chấm
-    - Hoặc xóa câu bị cụt nếu quá ngắn
     """
     text = text.strip()
     if not text:
         return text
-    # Kiểm tra nếu kết thúc bằng dấu câu
     if text[-1] in '.!?':
         return text
-    # Tìm câu cuối cùng hoàn chỉnh
-    last_sentence_end = max(
-        text.rfind('.'),
-        text.rfind('!'),
-        text.rfind('?')
-    )
     if last_sentence_end > 0:
-        # Lấy phần sau dấu câu cuối
-        incomplete_part = text[last_sentence_end + 1:].strip()
-        # Nếu phần không hoàn chỉnh quá ngắn (ít hơn 5 từ), xóa nó
-        if len(incomplete_part.split()) < 5:
-            return text[:last_sentence_end + 1]
-        else:
-            # Thêm dấu chấm để kết thúc
-            return text + '.'
     # Nếu không có dấu câu nào, thêm dấu chấm
     return text + '.'
-def format_as_bullet_points(summaries: list[str]) -> list[str]:
     """
-    Chuyển đổi các đoạn tóm tắt thành danh sách bullet points.
     """
     bullet_points = []
@@ -162,17 +142,18 @@ def format_as_bullet_points(summaries: list[str]) -> list[str]:
         for sentence in sentences:
             sentence = sentence.strip()
-            if sentence and len(sentence) > 10:  # Bỏ qua câu quá ngắn
                 # Đảm bảo câu kết thúc đúng
-                sentence = fix_truncated_sentence(sentence)
-                bullet_points.append(sentence)
-    return bullet_points
 def generate_summary(text: str) -> str:
     """
-    Sinh tóm tắt với các tham số chống cụt.
     """
     try:
         # Tokenize input
@@ -183,21 +164,25 @@ def generate_summary(text: str) -> str:
             return_tensors="pt"
         )
-        # Generate summary
         with torch.no_grad():
             summary_ids = model.generate(
                 inputs["input_ids"],
                 attention_mask=inputs["attention_mask"],
-                max_length=300,
                 min_length=100,
-                no_repeat_ngram_size=3,
-                repetition_penalty=2.5,
                 num_beams=4,
                 early_stopping=True
             )
         # Decode output
         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
         return summary
     except Exception as e:
         print(f"Error generating summary: {e}")
@@ -206,10 +191,17 @@ def generate_summary(text: str) -> str:
 def summarize_long_text(text: str) -> list[str]:
     """
-    Tóm tắt văn bản dài bằng cách chia nhỏ và tóm tắt từng phần.
     """
-    # Chia văn bản thành các chunk 700 từ
-    chunks = chunk_text_by_words(text, max_words=700)
     summaries = []
     for i, chunk in enumerate(chunks):
@@ -221,40 +213,10 @@ def summarize_long_text(text: str) -> list[str]:
     return summaries
-def extract_text_from_url(url: str) -> str:
-    """
-    Trích xuất văn bản từ URL báo chí sử dụng newspaper3k.
-    """
-    if not NEWSPAPER_AVAILABLE:
-        raise HTTPException(
-            status_code=400,
-            detail="newspaper3k không được cài đặt. Vui lòng gửi text trực tiếp."
-        )
-    try:
-        article = Article(url, language='vi')
-        article.download()
-        article.parse()
-        text = article.text
-        if not text:
-            raise HTTPException(
-                status_code=400,
-                detail="Không thể trích xuất văn bản từ URL."
-            )
-        return text
-    except Exception as e:
-        raise HTTPException(
-            status_code=400,
-            detail=f"Lỗi khi trích xuất URL: {str(e)}"
-        )
 def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
     """
     Đọc PDF từ byte stream sử dụng PyMuPDF.
-    Không lưu file ra ổ cứng.
     """
     try:
         # Mở PDF từ byte stream
@@ -278,6 +240,8 @@ def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
             )
         return full_text
     except Exception as e:
         raise HTTPException(
             status_code=400,
@@ -308,25 +272,12 @@ async def health_check():
 @app.post("/summarize")
 async def summarize_text(request: SummarizeRequest):
     """
-    Tóm tắt văn bản hoặc URL báo chí.
-    - Gửi `text` để tóm tắt văn bản trực tiếp
-    - Gửi `url` để trích xuất và tóm tắt bài báo
     """
-    # Validate input
-    if not request.text and not request.url:
-        raise HTTPException(
-            status_code=400,
-            detail="Vui lòng cung cấp 'text' hoặc 'url'."
-        )
-    # Get text from URL or use provided text
-    if request.url:
-        text = extract_text_from_url(request.url)
-    else:
-        text = request.text
-    # Validate text length
     if not text or len(text.strip()) < 50:
         raise HTTPException(
             status_code=400,
@@ -348,8 +299,9 @@ async def summarize_text(request: SummarizeRequest):
     return {
         "success": True,
         "original_length": len(text),
         "num_chunks": len(summaries),
-        "bullet_points": bullet_points
     }
@@ -357,9 +309,8 @@ async def summarize_text(request: SummarizeRequest):
 async def upload_pdf(file: UploadFile = File(...)):
     """
     Upload và tóm tắt file PDF.
-    - Đọc trực tiếp từ byte stream, không lưu file ra ổ cứng
-    - Hỗ trợ file PDF có text (không hỗ trợ ảnh scan)
     """
     # Validate file type
     if not file.filename.lower().endswith('.pdf'):
@@ -368,10 +319,10 @@ async def upload_pdf(file: UploadFile = File(...)):
             detail="Chỉ hỗ trợ file PDF."
         )
-    # Read file content directly from byte stream
-    pdf_bytes = await file.read()
-    if len(pdf_bytes) == 0:
         raise HTTPException(
             status_code=400,
             detail="File rỗng."
@@ -379,14 +330,14 @@ async def upload_pdf(file: UploadFile = File(...)):
     # Limit file size (10MB max)
     max_size = 10 * 1024 * 1024  # 10MB
-    if len(pdf_bytes) > max_size:
         raise HTTPException(
             status_code=400,
             detail="File quá lớn. Giới hạn 10MB."
         )
-    # Extract text from PDF bytes
-    text = extract_text_from_pdf_bytes(pdf_bytes)
     # Validate extracted text
     if len(text.strip()) < 50:
@@ -411,8 +362,9 @@ async def upload_pdf(file: UploadFile = File(...)):
         "success": True,
         "filename": file.filename,
         "original_length": len(text),
         "num_chunks": len(summaries),
-        "bullet_points": bullet_points
     }

 """
 import re
 from typing import Optional
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 import torch
 import fitz  # PyMuPDF
 # ============================================================
 # Initialize FastAPI App
 # ============================================================
     version="1.0.0"
 )
+# CORS middleware - Allow All Origins for GitHub Pages
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
 # Request Models
 # ============================================================
 class SummarizeRequest(BaseModel):
+    text: str
 # ============================================================
 # Helper Functions
 # ============================================================
+def chunk_text_by_words(text: str, max_words: int = 800) -> list[str]:
     """
     Chia văn bản thành các đoạn tối đa max_words từ.
     Giữ nguyên câu hoàn chỉnh khi có thể.
         # Nếu câu đơn lẻ dài hơn max_words, chia nhỏ câu đó
         if sentence_word_count > max_words:
             if current_chunk:
                 chunks.append(' '.join(current_chunk))
                 current_chunk = []
                 current_word_count = 0
             for i in range(0, sentence_word_count, max_words):
                 chunk_words = sentence_words[i:i + max_words]
                 chunks.append(' '.join(chunk_words))
         # Nếu thêm câu này vượt quá giới hạn
         elif current_word_count + sentence_word_count > max_words:
             if current_chunk:
                 chunks.append(' '.join(current_chunk))
             current_chunk = [sentence]
             current_word_count = sentence_word_count
     return chunks
+def fix_truncated_text(text: str) -> str:
     """
+    Nếu kết quả không kết thúc bằng dấu câu,
+    tự động cắt đến dấu chấm gần nhất.
     """
     text = text.strip()
     if not text:
         return text
+    # Nếu đã kết thúc bằng dấu câu, trả về nguyên
     if text[-1] in '.!?':
         return text
+    # Tìm dấu câu gần nhất
+    last_period = text.rfind('.')
+    last_exclaim = text.rfind('!')
+    last_question = text.rfind('?')
+    last_sentence_end = max(last_period, last_exclaim, last_question)
     if last_sentence_end > 0:
+        # Cắt đến dấu câu gần nhất
+        return text[:last_sentence_end + 1]
     # Nếu không có dấu câu nào, thêm dấu chấm
     return text + '.'
+def format_as_bullet_points(summaries: list[str]) -> str:
     """
+    Chuyển đổi các đoạn tóm tắt thành Bullet Points.
+    Mỗi ý một dòng, bắt đầu bằng '•'.
     """
     bullet_points = []
         for sentence in sentences:
             sentence = sentence.strip()
+            if sentence and len(sentence) > 15:  # Bỏ qua câu quá ngắn
                 # Đảm bảo câu kết thúc đúng
+                sentence = fix_truncated_text(sentence)
+                bullet_points.append(f"• {sentence}")
+    return '\n'.join(bullet_points)
 def generate_summary(text: str) -> str:
     """
+    Sinh tóm tắt với torch.no_grad() để tiết kiệm RAM.
+    Tham số: max_length=350, min_length=100, num_beams=4, repetition_penalty=2.5
     """
     try:
         # Tokenize input
             return_tensors="pt"
         )
+        # Generate với torch.no_grad() để tiết kiệm RAM
         with torch.no_grad():
             summary_ids = model.generate(
                 inputs["input_ids"],
                 attention_mask=inputs["attention_mask"],
+                max_length=350,
                 min_length=100,
                 num_beams=4,
+                repetition_penalty=2.5,
+                no_repeat_ngram_size=3,
                 early_stopping=True
             )
         # Decode output
         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        # Fix truncated text
+        summary = fix_truncated_text(summary)
         return summary
     except Exception as e:
         print(f"Error generating summary: {e}")
 def summarize_long_text(text: str) -> list[str]:
     """
+    Nếu văn bản > 800 từ, chia nhỏ và tóm tắt từng phần.
     """
+    word_count = len(text.split())
+    # Nếu văn bản ngắn, tóm tắt trực tiếp
+    if word_count <= 800:
+        summary = generate_summary(text)
+        return [summary] if summary else []
+    # Chia nhỏ văn bản dài
+    chunks = chunk_text_by_words(text, max_words=800)
     summaries = []
     for i, chunk in enumerate(chunks):
     return summaries
 def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
     """
     Đọc PDF từ byte stream sử dụng PyMuPDF.
+    KHÔNG lưu file ra đĩa.
     """
     try:
         # Mở PDF từ byte stream
             )
         return full_text
+    except HTTPException:
+        raise
     except Exception as e:
         raise HTTPException(
             status_code=400,
 @app.post("/summarize")
 async def summarize_text(request: SummarizeRequest):
     """
+    Tóm tắt văn bản tiếng Việt.
+    Trả về danh sách Bullet Points.
     """
+    text = request.text
+    # Validate text
     if not text or len(text.strip()) < 50:
         raise HTTPException(
             status_code=400,
     return {
         "success": True,
         "original_length": len(text),
+        "word_count": len(text.split()),
         "num_chunks": len(summaries),
+        "summary": bullet_points
     }
 async def upload_pdf(file: UploadFile = File(...)):
     """
     Upload và tóm tắt file PDF.
+    Đọc qua byte stream, KHÔNG lưu file ra đĩa.
+    Trả về danh sách Bullet Points.
     """
     # Validate file type
     if not file.filename.lower().endswith('.pdf'):
             detail="Chỉ hỗ trợ file PDF."
         )
+    # Đọc file qua contents = await file.read()
+    contents = await file.read()
+    if len(contents) == 0:
         raise HTTPException(
             status_code=400,
             detail="File rỗng."
     # Limit file size (10MB max)
     max_size = 10 * 1024 * 1024  # 10MB
+    if len(contents) > max_size:
         raise HTTPException(
             status_code=400,
             detail="File quá lớn. Giới hạn 10MB."
         )
+    # Extract text from PDF bytes (dùng fitz.open(stream=contents, filetype='pdf'))
+    text = extract_text_from_pdf_bytes(contents)
     # Validate extracted text
     if len(text.strip()) < 50:
         "success": True,
         "filename": file.filename,
         "original_length": len(text),
+        "word_count": len(text.split()),
         "num_chunks": len(summaries),
+        "summary": bullet_points
     }