Spaces:

Tungdabiban
/

Multi_source_Summarizer

Sleeping

App Files Files Community

Tungdabiban committed on Feb 5

Commit

440b8dd

verified ·

1 Parent(s): 1810c40

Upload 2 files

Browse files

Files changed (2) hide show

app.py +412 -0
requirements.txt +9 -0

app.py ADDED Viewed

	@@ -0,0 +1,412 @@

+"""
+FastAPI Text Summarization App for Hugging Face CPU Space
+Model: vinai/bartpho-syllable-base
+"""
+import re
+import io
+from typing import Optional
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from transformers import pipeline
+import fitz  # PyMuPDF
+# Optional: newspaper3k for URL extraction
+try:
+    from newspaper import Article
+    NEWSPAPER_AVAILABLE = True
+except ImportError:
+    NEWSPAPER_AVAILABLE = False
+# ============================================================
+# Initialize FastAPI App
+# ============================================================
+app = FastAPI(
+    title="Vietnamese Text Summarizer",
+    description="Summarize Vietnamese text using BARTpho model",
+    version="1.0.0"
+)
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# ============================================================
+# Load Model
+# ============================================================
+print("Loading BARTpho model...")
+summarizer = pipeline(
+    "summarization",
+    model="vinai/bartpho-syllable-base",
+    tokenizer="vinai/bartpho-syllable-base",
+    device=-1  # CPU
+)
+print("Model loaded successfully!")
+# ============================================================
+# Request Models
+# ============================================================
+# Request body for POST /summarize. Both fields are optional in the schema;
+# the endpoint rejects a request where neither is set and uses `url` when
+# both are set.
+class SummarizeRequest(BaseModel):
+    # Text to summarize directly.
+    text: Optional[str] = None
+    # Address of an article whose text is extracted and then summarized.
+    url: Optional[str] = None
+# ============================================================
+# Helper Functions
+# ============================================================
+def chunk_text_by_words(text: str, max_words: int = 700) -> list[str]:
+    """
+    Chia văn bản thành các đoạn tối đa max_words từ.
+    Giữ nguyên câu hoàn chỉnh khi có thể.
+    """
+    # Clean text
+    text = re.sub(r'\s+', ' ', text).strip()
+    # Split into sentences
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    chunks = []
+    current_chunk = []
+    current_word_count = 0
+    for sentence in sentences:
+        sentence_words = sentence.split()
+        sentence_word_count = len(sentence_words)
+        # Nếu câu đơn lẻ dài hơn max_words, chia nhỏ câu đó
+        if sentence_word_count > max_words:
+            # Lưu chunk hiện tại trước
+            if current_chunk:
+                chunks.append(' '.join(current_chunk))
+                current_chunk = []
+                current_word_count = 0
+            # Chia câu dài thành các phần
+            for i in range(0, sentence_word_count, max_words):
+                chunk_words = sentence_words[i:i + max_words]
+                chunks.append(' '.join(chunk_words))
+        # Nếu thêm câu này vượt quá giới hạn
+        elif current_word_count + sentence_word_count > max_words:
+            # Lưu chunk hiện tại
+            if current_chunk:
+                chunks.append(' '.join(current_chunk))
+            # Bắt đầu chunk mới với câu này
+            current_chunk = [sentence]
+            current_word_count = sentence_word_count
+        else:
+            current_chunk.append(sentence)
+            current_word_count += sentence_word_count
+    # Lưu chunk cuối cùng
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+    return chunks
+def fix_truncated_sentence(text: str) -> str:
+    """
+    Xử lý câu bị cụt ở cuối.
+    - Nếu câu cuối không kết thúc bằng dấu câu, thêm dấu chấm
+    - Hoặc xóa câu bị cụt nếu quá ngắn
+    """
+    text = text.strip()
+    if not text:
+        return text
+    # Kiểm tra nếu kết thúc bằng dấu câu
+    if text[-1] in '.!?':
+        return text
+    # Tìm câu cuối cùng hoàn chỉnh
+    last_sentence_end = max(
+        text.rfind('.'),
+        text.rfind('!'),
+        text.rfind('?')
+    )
+    if last_sentence_end > 0:
+        # Lấy phần sau dấu câu cuối
+        incomplete_part = text[last_sentence_end + 1:].strip()
+        # Nếu phần không hoàn chỉnh quá ngắn (ít hơn 5 từ), xóa nó
+        if len(incomplete_part.split()) < 5:
+            return text[:last_sentence_end + 1]
+        else:
+            # Thêm dấu chấm để kết thúc
+            return text + '.'
+    # Nếu không có dấu câu nào, thêm dấu chấm
+    return text + '.'
+def format_as_bullet_points(summaries: list[str]) -> list[str]:
+    """
+    Chuyển đổi các đoạn tóm tắt thành danh sách bullet points.
+    """
+    bullet_points = []
+    for summary in summaries:
+        # Chia thành các câu
+        sentences = re.split(r'(?<=[.!?])\s+', summary)
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if sentence and len(sentence) > 10:  # Bỏ qua câu quá ngắn
+                # Đảm bảo câu kết thúc đúng
+                sentence = fix_truncated_sentence(sentence)
+                bullet_points.append(sentence)
+    return bullet_points
+def generate_summary(text: str) -> str:
+    """
+    Sinh tóm tắt với các tham số chống cụt.
+    """
+    try:
+        result = summarizer(
+            text,
+            max_length=300,
+            min_length=100,
+            no_repeat_ngram_size=3,
+            repetition_penalty=2.5,
+            num_beams=4,
+            do_sample=False,
+            early_stopping=True
+        )
+        return result[0]['summary_text']
+    except Exception as e:
+        print(f"Error generating summary: {e}")
+        return ""
+def summarize_long_text(text: str) -> list[str]:
+    """
+    Tóm tắt văn bản dài bằng cách chia nhỏ và tóm tắt từng phần.
+    """
+    # Chia văn bản thành các chunk 700 từ
+    chunks = chunk_text_by_words(text, max_words=700)
+    summaries = []
+    for i, chunk in enumerate(chunks):
+        print(f"Processing chunk {i + 1}/{len(chunks)}...")
+        summary = generate_summary(chunk)
+        if summary:
+            summaries.append(summary)
+    return summaries
+def extract_text_from_url(url: str) -> str:
+    """
+    Trích xuất văn bản từ URL báo chí sử dụng newspaper3k.
+    """
+    if not NEWSPAPER_AVAILABLE:
+        raise HTTPException(
+            status_code=400,
+            detail="newspaper3k không được cài đặt. Vui lòng gửi text trực tiếp."
+        )
+    try:
+        article = Article(url, language='vi')
+        article.download()
+        article.parse()
+        text = article.text
+        if not text:
+            raise HTTPException(
+                status_code=400,
+                detail="Không thể trích xuất văn bản từ URL."
+            )
+        return text
+    except Exception as e:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Lỗi khi trích xuất URL: {str(e)}"
+        )
+def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
+    """
+    Đọc PDF từ byte stream sử dụng PyMuPDF.
+    Không lưu file ra ổ cứng.
+    """
+    try:
+        # Mở PDF từ byte stream
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        text_parts = []
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+            text = page.get_text("text")
+            if text:
+                text_parts.append(text)
+        doc.close()
+        full_text = '\n'.join(text_parts)
+        if not full_text.strip():
+            raise HTTPException(
+                status_code=400,
+                detail="Không thể trích xuất văn bản từ PDF. File có thể là ảnh scan."
+            )
+        return full_text
+    except Exception as e:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Lỗi khi đọc PDF: {str(e)}"
+        )
+# ============================================================
+# API Endpoints
+# ============================================================
+@app.get("/")
+async def root():
+    """Health check endpoint."""
+    return {
+        "status": "running",
+        "model": "vinai/bartpho-syllable-base",
+        "endpoints": ["/summarize", "/upload-pdf"]
+    }
+@app.get("/health")
+async def health_check():
+    """Health check for Hugging Face Space."""
+    return {"status": "healthy"}
+@app.post("/summarize")
+async def summarize_text(request: SummarizeRequest):
+    """
+    Tóm tắt văn bản hoặc URL báo chí.
+    - Gửi `text` để tóm tắt văn bản trực tiếp
+    - Gửi `url` để trích xuất và tóm tắt bài báo
+    """
+    # Validate input
+    if not request.text and not request.url:
+        raise HTTPException(
+            status_code=400,
+            detail="Vui lòng cung cấp 'text' hoặc 'url'."
+        )
+    # Get text from URL or use provided text
+    if request.url:
+        text = extract_text_from_url(request.url)
+    else:
+        text = request.text
+    # Validate text length
+    if not text or len(text.strip()) < 50:
+        raise HTTPException(
+            status_code=400,
+            detail="Văn bản quá ngắn để tóm tắt (cần ít nhất 50 ký tự)."
+        )
+    # Generate summaries
+    summaries = summarize_long_text(text)
+    if not summaries:
+        raise HTTPException(
+            status_code=500,
+            detail="Không thể tạo tóm tắt."
+        )
+    # Format as bullet points
+    bullet_points = format_as_bullet_points(summaries)
+    return {
+        "success": True,
+        "original_length": len(text),
+        "num_chunks": len(summaries),
+        "bullet_points": bullet_points
+    }
+@app.post("/upload-pdf")
+async def upload_pdf(file: UploadFile = File(...)):
+    """
+    Upload và tóm tắt file PDF.
+    - Đọc trực tiếp từ byte stream, không lưu file ra ổ cứng
+    - Hỗ trợ file PDF có text (không hỗ trợ ảnh scan)
+    """
+    # Validate file type
+    if not file.filename.lower().endswith('.pdf'):
+        raise HTTPException(
+            status_code=400,
+            detail="Chỉ hỗ trợ file PDF."
+        )
+    # Read file content directly from byte stream
+    pdf_bytes = await file.read()
+    if len(pdf_bytes) == 0:
+        raise HTTPException(
+            status_code=400,
+            detail="File rỗng."
+        )
+    # Limit file size (10MB max)
+    max_size = 10 * 1024 * 1024  # 10MB
+    if len(pdf_bytes) > max_size:
+        raise HTTPException(
+            status_code=400,
+            detail="File quá lớn. Giới hạn 10MB."
+        )
+    # Extract text from PDF bytes
+    text = extract_text_from_pdf_bytes(pdf_bytes)
+    # Validate extracted text
+    if len(text.strip()) < 50:
+        raise HTTPException(
+            status_code=400,
+            detail="Văn bản trích xuất từ PDF quá ngắn."
+        )
+    # Generate summaries
+    summaries = summarize_long_text(text)
+    if not summaries:
+        raise HTTPException(
+            status_code=500,
+            detail="Không thể tạo tóm tắt."
+        )
+    # Format as bullet points
+    bullet_points = format_as_bullet_points(summaries)
+    return {
+        "success": True,
+        "filename": file.filename,
+        "original_length": len(text),
+        "num_chunks": len(summaries),
+        "bullet_points": bullet_points
+    }
+# ============================================================
+# Run with Uvicorn (for local development)
+# ============================================================
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+fastapi>=0.104.0
+uvicorn>=0.24.0
+transformers>=4.35.0
+torch>=2.0.0
+sentencepiece>=0.1.99
+PyMuPDF>=1.23.0
+python-multipart>=0.0.6
+newspaper3k>=0.2.8
+lxml_html_clean>=0.1.0