Tungdabiban commited on
Commit
e8e26ec
·
verified ·
1 Parent(s): 4bffbec

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +176 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import math
4
+ import time
5
+ import uuid
6
+ from fastapi import FastAPI, File, UploadFile, HTTPException
7
+ from fastapi.responses import StreamingResponse
8
+ from fastapi.middleware.cors import CORSMiddleware
9
+ import io
10
+ import fitz # PyMuPDF
11
+ import json
12
+ from transformers import pipeline
13
+ from typing import Iterator, Optional
14
+ import re
15
+
16
# Model name: default Vietnamese-optimized model with fallback for CPU usage on Hugging Face Free tier.
# Overridable via the SUMMARIZER_MODEL_VI_VN environment variable.
MODEL_NAME: str = os.getenv("SUMMARIZER_MODEL_VI_VN", "VietAI/vit5-base-vietnamese")
# Optimized for CPU usage on Hugging Face Free tier
PRIMARY_VI_MODEL: str = MODEL_NAME  # alias consumed by the startup loader
FALLBACK_MODEL: str = "google/mt5-small"  # used when the primary model fails to load

# Chunk and safety configuration (CPU-friendly), configurable via environment
CHUNK_WORDS: int = int(os.getenv("CHUNK_WORDS", "600"))  # smaller chunks to reduce per-chunk compute
MAX_CHUNKS: int = int(os.getenv("MAX_CHUNKS", "20"))  # safety limit to avoid long processing times

# Module-level logger; basicConfig here configures the root handler for the whole process.
logger = logging.getLogger("pdf_summarizer")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
28
+
29
app = FastAPI(title="PDF Summarizer with Streaming", version="0.1.0")

# CORS: allow all origins
# NOTE(review): wide-open CORS (all origins/methods/headers) is fine for a
# public demo Space; tighten allow_origins before any production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Summarizer instance loaded at startup, reused for all requests.
# Remains None until the startup hook runs.
summarizer = None
# Name of the model actually loaded (primary or fallback); surfaced in request logs.
current_model_name = None
42
+
43
@app.on_event("startup")
def load_model():
    """Load the shared summarization pipeline once at process startup.

    Tries the primary Vietnamese model first; if loading fails for any
    reason (download error, memory, incompatible weights), falls back to
    the lighter multilingual mT5-small. Stores the pipeline in the module
    global ``summarizer`` and records which model won in
    ``current_model_name`` for request logging.
    """
    global summarizer, current_model_name
    model_to_load = PRIMARY_VI_MODEL
    current_model_name = model_to_load
    try:
        # Lazy %-style logging args: the message is only formatted if emitted.
        logger.info("Loading Vietnamese model for CPU: %s", model_to_load)
        summarizer = pipeline("summarization", model=model_to_load)
        logger.info("Vietnamese model loaded successfully.")
    except Exception as e:
        logger.warning(
            "Failed to load Vietnamese model (%s) due to: %s. Falling back to MT5-small.",
            model_to_load,
            e,
        )
        current_model_name = FALLBACK_MODEL
        # NOTE(review): if this second load also fails, the startup event
        # raises and the service does not come up — fail-fast, but confirm
        # that is the intended behavior on the free tier.
        summarizer = pipeline("summarization", model=FALLBACK_MODEL)
        logger.info("Fallback model MT5-small loaded.")
57
+
58
def pdf_bytes_to_text(pdf_bytes: bytes) -> str:
    """Extract plain text from a PDF held in memory, without touching disk.

    Args:
        pdf_bytes: Raw bytes of the uploaded PDF.

    Returns:
        Text of all pages (pages with no extractable text are skipped),
        joined with newlines; empty string if nothing was extracted.

    Raises:
        Whatever ``fitz.open`` raises for corrupt or non-PDF input
        (propagated to the caller).
    """
    # Context manager guarantees the document handle is closed even if
    # text extraction raises — the original leaked the handle on error.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        texts = [page.get_text("text") for page in doc]
    return "\n".join(t for t in texts if t)
70
+
71
def finalize_sentence(text: str) -> str:
    """Return *text* trimmed so it ends on sentence-final punctuation.

    If the stripped text already ends with '.', '!' or '?' it is returned
    unchanged. Otherwise, when such punctuation appears earlier, the
    unfinished trailing fragment after the last occurrence is dropped;
    failing that, a period is appended. Empty input comes back as-is.
    """
    trimmed = text.strip()
    if not trimmed:
        return trimmed
    if trimmed.endswith((".", "!", "?")):
        return trimmed
    # Prefer cutting at the last complete sentence over inventing punctuation.
    cut = max(trimmed.rfind(mark) for mark in ".!?")
    if cut >= 0:
        return trimmed[:cut + 1]
    # No sentence punctuation anywhere: close the text with a period.
    return trimmed + "."
84
+
85
def iter_summaries(text: str, length_ratio: float, request_id: Optional[str] = None) -> Iterator[tuple[int, str, float]]:
    """Chunk ``text`` and yield ``(index, summary, seconds)`` for each chunk.

    Splits the document into ``CHUNK_WORDS``-word blocks — the same size the
    /summarize endpoint uses to pre-compute ``chunk_count`` — and runs the
    shared ``summarizer`` pipeline on each block.

    Args:
        text: Full document text.
        length_ratio: Factor (0.1–1.0, validated by the caller) scaling the
            per-chunk min/max summary lengths.
        request_id: Optional correlation id; accepted for caller
            compatibility but not used inside this generator.

    Yields:
        ``(chunk_index, finalized_summary, generation_seconds)``. On a model
        error the summary is an inline "[summarization error: ...]" marker
        and the duration is 0.0.
    """
    # BUGFIX: previously chunked at a hard-coded 800 words while the endpoint
    # estimated chunk_count from CHUNK_WORDS (default 600), so the MAX_CHUNKS
    # guard and the streamed ETA disagreed with the chunks actually produced.
    words = text.split()
    # Mirror the endpoint's CHUNK_WORDS <= 0 handling: one chunk for the whole text.
    step = CHUNK_WORDS if CHUNK_WORDS > 0 else max(1, len(words))
    chunks = [" ".join(words[i:i + step]) for i in range(0, len(words), step)]

    for idx, chunk in enumerate(chunks):
        chunk_word_count = len(chunk.split())
        # Length penalty scales with chunk size to balance brevity vs coverage
        lp = 0.5 + min(1.5, (chunk_word_count / 1000) * 1.5)

        # min_length and max_length proportional to chunk size and length_ratio
        min_len = max(20, int(chunk_word_count * 0.05 * length_ratio))
        max_len = max(min_len + 10, int(chunk_word_count * 0.25 * length_ratio))

        try:
            t0 = time.time()
            result = summarizer(
                chunk,
                min_length=min_len,
                max_length=max_len,
                length_penalty=lp,
                repetition_penalty=2.5,
                no_repeat_ngram_size=3,
                num_beams=4,
            )
            duration = time.time() - t0
            summary = result[0]["summary_text"] if isinstance(result, list) else result["summary_text"]
        except Exception as e:
            # Keep streaming: surface the failure inline rather than aborting the response.
            summary = f"[summarization error: {str(e)}]"
            duration = 0.0

        summary = finalize_sentence(summary)
        yield idx, summary, duration
121
+
122
@app.post("/summarize")
async def summarize(pdf_file: UploadFile = File(...), length_ratio: float = 0.5):
    """
    Receive a PDF via memory (bytes) and return chunk-wise summaries as JSON Lines.

    Parameters:
        pdf_file: Uploaded file; must carry content type "application/pdf".
        length_ratio: Scale factor (0.1–1.0) for per-chunk summary lengths.

    Returns:
        StreamingResponse whose body is one JSON object per line with keys
        "request_id", "chunk", "summary", and "estimate_seconds".

    Raises:
        HTTPException(400): non-PDF upload, out-of-range length_ratio,
        PDF with no extractable text, or a document exceeding MAX_CHUNKS.
    """
    if pdf_file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Only PDF files are supported.")
    if not (0.1 <= length_ratio <= 1.0):
        raise HTTPException(status_code=400, detail="length_ratio must be between 0.1 and 1.0")

    # Whole upload is read into memory; nothing is written to disk.
    pdf_bytes = await pdf_file.read()
    text = pdf_bytes_to_text(pdf_bytes)
    if not text.strip():
        raise HTTPException(status_code=400, detail="PDF contains no readable text.")
    # Safety guard: limit number of chunks to avoid long processing times on CPU/free tier
    # NOTE(review): this estimate divides by CHUNK_WORDS — confirm it matches the
    # chunk size iter_summaries actually uses, or the guard/ETA drift from reality.
    total_words = len(text.split())
    chunk_count = math.ceil(total_words / CHUNK_WORDS) if CHUNK_WORDS > 0 else 1
    logger.info(f"Document text length: {total_words} words; chunks: {chunk_count}")
    if chunk_count > MAX_CHUNKS:
        raise HTTPException(
            status_code=400,
            detail=f"Document too long: requires {chunk_count} chunks (max {MAX_CHUNKS}). Please reduce the PDF size or length_ratio.",
        )

    # Per-request identifiers and timing for enhanced logging
    request_id = uuid.uuid4().hex
    start_time = time.time()
    logger.info(
        f"Request {request_id}: starting. words={total_words}, chunks={chunk_count}, model={current_model_name}"
    )

    def gen() -> Iterator[bytes]:
        # Emit one JSON line per chunk as soon as its summary is ready,
        # so clients see progress instead of waiting for the whole document.
        durations = []
        for idx, summary, duration in iter_summaries(text, length_ratio, request_id):
            durations.append(duration)
            # Rolling average of observed per-chunk durations drives the ETA.
            avg = sum(durations) / len(durations) if durations else 0.0
            remaining = max(0, chunk_count - idx - 1)
            est_sec = remaining * avg
            payload = {
                "request_id": request_id,
                "chunk": idx,
                "summary": summary,
                "estimate_seconds": round(est_sec, 2),
            }
            yield (json.dumps(payload) + "\n").encode("utf-8")
        # Finalize logging after streaming completes
        logger.info(
            f"Request {request_id} finished: chunks={chunk_count}, total_words={total_words}, model={current_model_name}, duration={time.time()-start_time:.2f}s"
        )

    return StreamingResponse(gen(), media_type="application/jsonlines")
173
+
174
+ @app.get("/health")
175
+ async def health():
176
+ return {"status": "online"}
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ python-multipart
4
+ PyMuPDF
5
+ transformers
6
+ torch
7
+ sentencepiece