Spaces:

hoangthiencm
/

ht-math-web-backend

Running

App Files Files Community

hoangthiencm committed on Dec 27, 2025

Commit

0b6acc2

verified ·

1 Parent(s): d9c305d

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -84

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """
 Backend API cho HT_MATH_WEB - Chạy trên Hugging Face Spaces (Docker Version)
-Phiên bản: 8.0 (Content-Based Stitching OCR - Overlap Algorithm)
 Tác giả: Hoàng Tấn Thiên
 """
@@ -33,6 +33,16 @@ except ImportError:
 except OSError:
     print("CRITICAL WARNING: pandoc binary not found in system path.")
 # --- SUPABASE ---
 try:
     from supabase import create_client, Client
@@ -47,7 +57,7 @@ GEMINI_API_KEYS = os.getenv("GEMINI_API_KEYS", "").split(",")
 GEMINI_MODELS = os.getenv("GEMINI_MODELS", "gemini-2.5-flash,gemini-1.5-pro").split(",")
 SUPABASE_URL = os.getenv("SUPABASE_URL", "")
 SUPABASE_KEY = os.getenv("SUPABASE_KEY", "")
-MAX_THREADS = int(os.getenv("MAX_THREADS", "5")) # Tăng thread để xử lý các mảnh cắt song song
 ADMIN_SECRET_KEY = os.getenv("ADMIN_SECRET_KEY", "admin123")
 # Setup Supabase
@@ -58,7 +68,7 @@ if SUPABASE_AVAILABLE and SUPABASE_URL and SUPABASE_KEY:
     except Exception as e:
         print(f"Warning: Không thể kết nối Supabase: {e}")
-app = FastAPI(title="HT_MATH_WEB API", version="8.0")
 app.add_middleware(
     CORSMiddleware,
@@ -112,82 +122,61 @@ def check_rate_limit(request: Request):
             print(f"[RateLimit] IP {client_ip} requesting too fast.")
     ip_rate_limits[client_ip] = now
-# ===== PROMPTS =====
-DIRECT_GEMINI_PROMPT_TEXT_ONLY = r"""**TRÍCH XUẤT VĂN BẢN THUẦN TÚY**
-⚠️ YÊU CẦU BẮT BUỘC:
-- PHẢI trích xuất TOÀN BỘ nội dung xuất hiện trong ảnh/PDF
-- KHÔNG được bỏ sót bất kỳ câu hỏi, ví dụ, bài tập nào
-- KỂ CẢ các câu nhỏ, câu phụ, chữ mờ, chữ sát lề
-- Nếu không chắc, VẪN PHẢI ghi lại nội dung nhìn thấy
-⚠️ NHIỆM VỤ:
-1. Trích xuất toàn bộ văn bản, giữ nguyên định dạng gốc.
-2. KHÔNG sử dụng LaTeX ($...$), giữ nguyên biểu thức toán dạng text (ví dụ: x^2 + 1 = 0).
-3. Đánh dấu tiêu đề bằng Markdown **đậm**.
-4. KHÔNG thêm lời giải thích.
-"""
-DIRECT_GEMINI_PROMPT_LATEX = r"""Bạn là công cụ trích xuất văn bản từ ảnh/PDF. NHIỆM VỤ: Chuyển đổi nội dung trong ảnh sang Markdown với công thức LaTeX.
-⚠️ YÊU CẦU CỐT LÕI - KHÔNG ĐƯỢC BỎ SÓT:
-- Đọc kỹ từng pixel, trích xuất TOÀN BỘ nội dung từ trên xuống dưới.
-- KHÔNG bỏ qua bất kỳ bài tập, hình vẽ, hoặc ghi chú nhỏ nào.
-- Nếu ảnh bị cắt ngang chữ, hãy cố gắng đoán và hoàn thiện từ đó dựa trên ngữ cảnh.
-⚠️ QUY TẮC LATEX (BẮT BUỘC):
-1. Mọi công thức toán PHẢI bọc trong dấu $. Ví dụ: $x^2$, $\frac{1}{2}$.
-2. KHÔNG dùng \[...\] hoặc \(...\).
-3. Luôn có khoảng trắng trước dấu $: "Cho hàm số $f(x)$..." (Đúng).
-⚠️ ĐỊNH DẠNG:
-- Giữ nguyên cấu trúc dòng, đoạn.
-- Tiêu đề in đậm: **Câu 1:**, **Bài tập:**.
-- Bảng biểu giữ nguyên Markdown Table.
-CHỈ TRẢ VỀ MARKDOWN. KHÔNG GIẢI THÍCH THÊM.
 """
-# ===== STITCHING ALGORITHM (QUAN TRỌNG) =====
 def stitch_text(text_a: str, text_b: str, min_overlap_chars: int = 20) -> str:
-    """
-    Thuật toán ghép nối nội dung dựa trên sự trùng lặp (Content-Based Stitching).
-    So sánh phần đuôi của text_a và phần đầu của text_b để tìm đoạn trùng khớp nhất.
-    """
     if not text_a: return text_b
     if not text_b: return text_a
     a_lines = text_a.splitlines()
     b_lines = text_b.splitlines()
-    # Chỉ so sánh N dòng cuối của A và N dòng đầu của B để tối ưu hiệu năng
-    # (Tránh việc so sánh toàn bộ văn bản nếu văn bản quá dài)
     scan_window = min(len(a_lines), len(b_lines), 30)
     best_overlap_idx = 0
-    # Quét từ overlap lớn nhất (scan_window) về 1
     for i in range(scan_window, 0, -1):
-        # Lấy i dòng cuối của A
         tail_a = "\n".join(a_lines[-i:]).strip()
-        # Lấy i dòng đầu của B
         head_b = "\n".join(b_lines[:i]).strip()
-        # Kiểm tra độ dài tối thiểu và nội dung trùng khớp
-        # .strip() giúp bỏ qua sự khác biệt về khoảng trắng thừa
         if len(tail_a) >= min_overlap_chars and tail_a == head_b:
             best_overlap_idx = i
-            break # Tìm thấy overlap lớn nhất thì dừng ngay (Greedy)
     if best_overlap_idx > 0:
-        # Tìm thấy overlap: Ghép A + B (bỏ đi phần đầu trùng lặp của B)
-        # print(f"[Stitch] Found overlap: {best_overlap_idx} lines")
         return text_a + "\n" + "\n".join(b_lines[best_overlap_idx:])
     else:
-        # Không tìm thấy overlap: Nối bình thường với dòng trống
-        # print("[Stitch] No overlap found, simple join")
         return text_a + "\n\n" + text_b
 # ===== HELPER FUNCTIONS =====
 def clean_latex_formulas(text: str) -> str:
-    return re.sub(r'\$\s+(.*?)\s+\$', lambda m: f'${m.group(1).strip()}$', text)
 def hash_password(password: str) -> str:
     return hashlib.sha256(password.encode()).hexdigest()
@@ -195,21 +184,69 @@ def hash_password(password: str) -> str:
 def verify_password(password: str, hashed: str) -> bool:
     return hash_password(password) == hashed
 # ===== API ENDPOINTS =====
 @app.get("/")
 @app.get("/health")
 async def root():
     pandoc_status = "Not Found"
     try:
         pandoc_status = pypandoc.get_pandoc_version()
-    except:
-        pass
     return {
         "status": "ok",
-        "service": "HT_MATH_WEB API v8.0 (Stitching OCR)",
         "keys_loaded": key_manager.get_key_count(),
-        "pandoc_version": pandoc_status
     }
 @app.get("/api/models")
@@ -270,61 +307,67 @@ async def upload_image(file: UploadFile = File(...)):
 # --- CORE CONVERT LOGIC ---
 async def process_image_with_gemini(image: Image.Image, model_id: str, prompt: str, max_retries: int = 3) -> str:
-    """Gửi 1 ảnh (hoặc mảnh ảnh) lên Gemini và nhận text"""
     for attempt in range(max_retries):
         try:
             api_key = key_manager.get_next_key()
             if not api_key: raise ValueError("No API Key")
             genai.configure(api_key=api_key)
-            generation_config = {"temperature": 0.0, "top_p": 1.0, "max_output_tokens": 8192}
             model = genai.GenerativeModel(model_id, generation_config=generation_config)
             response = model.generate_content([prompt, image])
-            if response.text:
-                return response.text.strip()
         except Exception as e:
             if "429" in str(e) and attempt < max_retries - 1:
                 time.sleep(2)
                 continue
-            if attempt == max_retries - 1: raise e
     return ""
 async def process_large_image(image: Image.Image, model: str, prompt: str, semaphore: asyncio.Semaphore) -> str:
     """
-    Xử lý ảnh lớn bằng kỹ thuật: Overlap Splitting + Content-Based Stitching
     """
-    # Cấu hình cắt ảnh
-    CHUNK_HEIGHT = 2048 # Chiều cao mỗi mảnh (pixel)
-    OVERLAP_HEIGHT = 512 # Chiều cao phần chồng lặp (pixel)
     width, height = image.size
-    # Nếu ảnh nhỏ hơn ngưỡng cắt, xử lý bình thường
     if height <= CHUNK_HEIGHT:
         async with semaphore:
             return await process_image_with_gemini(image, model, prompt)
-    # --- Cắt ảnh thành các mảnh có Overlap ---
     chunks = []
     y = 0
     while y < height:
-        # Xác định vùng cắt
         bottom = min(y + CHUNK_HEIGHT, height)
         box = (0, y, width, bottom)
         chunk = image.crop(box)
         chunks.append(chunk)
-        # Nếu đã đến đáy ảnh thì dừng
-        if bottom == height:
-            break
-        # Di chuyển y xuống, nhưng lùi lại một đoạn overlap
         y += (CHUNK_HEIGHT - OVERLAP_HEIGHT)
-    print(f"[Split] Image height {height}px -> {len(chunks)} chunks with overlap.")
-    # --- Gửi song song các mảnh lên Gemini ---
     async def process_chunk(chunk_img, index):
         async with semaphore:
             text = await process_image_with_gemini(chunk_img, model, prompt)
@@ -333,11 +376,9 @@ async def process_large_image(image: Image.Image, model: str, prompt: str, semap
     tasks = [process_chunk(chunk, i) for i, chunk in enumerate(chunks)]
     chunk_results = await asyncio.gather(*tasks)
-    # Sắp xếp lại theo đúng thứ tự
     chunk_results.sort(key=lambda x: x[0])
     ordered_texts = [text for _, text in chunk_results]
-    # --- Ghép nối thông minh (Stitching) ---
     final_text = ordered_texts[0]
     for i in range(1, len(ordered_texts)):
         final_text = stitch_text(final_text, ordered_texts[i], min_overlap_chars=20)
@@ -361,21 +402,15 @@ async def convert_file(
         file_content = await file.read()
         file_ext = os.path.splitext(file.filename)[1].lower()
-        # Global semaphore để kiểm soát tổng số request đồng thời lên Gemini
-        # Tránh lỗi 429 Quota Exceeded khi cắt quá nhiều mảnh
         global_semaphore = asyncio.Semaphore(MAX_THREADS)
         results = []
         if file_ext == ".pdf":
             doc = fitz.open(stream=file_content, filetype="pdf")
-            # Hàm xử lý từng trang (có thể cắt nhỏ bên trong nếu trang dài)
             async def process_page_wrapper(page, idx):
-                # Render ảnh chất lượng cao
                 pix = page.get_pixmap(dpi=300)
                 img = Image.open(io.BytesIO(pix.tobytes("png")))
-                # Gọi hàm xử lý ảnh lớn (tự động cắt/ghép nếu cần)
                 text = await process_large_image(img, model, prompt, global_semaphore)
                 return idx, text
@@ -386,7 +421,6 @@ async def convert_file(
         elif file_ext in [".png", ".jpg", ".jpeg", ".bmp"]:
             img = Image.open(io.BytesIO(file_content))
-            # Xử lý ảnh upload (tự động cắt/ghép nếu dài)
             text = await process_large_image(img, model, prompt, global_semaphore)
             results.append(text)
         else:
@@ -400,7 +434,6 @@ async def convert_file(
         traceback.print_exc()
         raise HTTPException(status_code=500, detail=str(e))
-# --- WORD EXPORT API (PANDOC NATIVE) ---
 @app.post("/api/export-docx")
 async def export_docx(markdown_text: str = Form(...)):
     try:
@@ -418,7 +451,7 @@ async def export_docx(markdown_text: str = Form(...)):
         return FileResponse(
             output_filename,
             media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-            filename="Ket_qua_HT_MATH_Pandoc.docx"
         )
     except Exception as e:
         import traceback

 """
 Backend API cho HT_MATH_WEB - Chạy trên Hugging Face Spaces (Docker Version)
+Phiên bản: 9.0 (Copyright Safety & Fallback OCR)
 Tác giả: Hoàng Tấn Thiên
 """
 except OSError:
     print("CRITICAL WARNING: pandoc binary not found in system path.")
+# --- TESSERACT IMPORT (FALLBACK OCR) ---
+try:
+    import pytesseract
+    # Kiểm tra xem binary có tồn tại không (trong Docker thường ở /usr/bin/tesseract)
+    # pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
+    print("INFO: Tesseract OCR module loaded.")
+except ImportError:
+    print("WARNING: pytesseract not found. Fallback OCR will not work.")
+    pytesseract = None
 # --- SUPABASE ---
 try:
     from supabase import create_client, Client
 GEMINI_MODELS = os.getenv("GEMINI_MODELS", "gemini-2.5-flash,gemini-1.5-pro").split(",")
 SUPABASE_URL = os.getenv("SUPABASE_URL", "")
 SUPABASE_KEY = os.getenv("SUPABASE_KEY", "")
+MAX_THREADS = int(os.getenv("MAX_THREADS", "5"))
 ADMIN_SECRET_KEY = os.getenv("ADMIN_SECRET_KEY", "admin123")
 # Setup Supabase
     except Exception as e:
         print(f"Warning: Không thể kết nối Supabase: {e}")
+app = FastAPI(title="HT_MATH_WEB API", version="9.0")
 app.add_middleware(
     CORSMiddleware,
             print(f"[RateLimit] IP {client_ip} requesting too fast.")
     ip_rate_limits[client_ip] = now
+# ===== SAFE PROMPTS (ANTI-COPYRIGHT) =====
+# Đã đổi prompt để tránh trigger bản quyền (finish_reason=4)
+DIRECT_GEMINI_PROMPT_TEXT_ONLY = r"""Bạn là trợ lý AI hỗ trợ người khiếm thị số hóa tài liệu.
+Nhiệm vụ: Nhận diện và mô tả lại nội dung văn bản trong hình ảnh một cách trung thực, chính xác.
+Yêu cầu:
+- Trích xuất nội dung văn bản để lưu trữ dữ liệu.
+- Giữ nguyên các thông số, dữ liệu toán học.
+- Định dạng Markdown rõ ràng.
+- KHÔNG thêm lời bình luận.
+"""
+DIRECT_GEMINI_PROMPT_LATEX = r"""Bạn là trợ lý AI hỗ trợ số hóa tài liệu giáo dục.
+Nhiệm vụ: Phân tích hình ảnh và viết lại nội dung dưới dạng Markdown chuẩn Toán học (LaTeX).
+Mục đích: Số hóa để lưu trữ và giảng dạy, không nhằm mục đích sao chép thương mại.
+⚠️ YÊU CẦU XỬ LÝ AN TOÀN:
+1. Nếu gặp nội dung giống sách giáo khoa hoặc tài liệu có bản quyền, hãy VIẾT LẠI (paraphrase) lời dẫn nhưng GIỮ NGUYÊN công thức toán học và số liệu.
+2. Trình bày công thức toán trong dấu `$`. Ví dụ: $x^2 + 2x = 0$.
+3. Đảm bảo cấu trúc bài tập rõ ràng (Câu 1, Câu 2...).
+4. Nếu hình ảnh mờ hoặc bị cắt, hãy cố gắng phục hồi dựa trên ngữ cảnh toán học.
+CHỈ TRẢ VỀ NỘI DUNG MARKDOWN.
 """
+# ===== STITCHING ALGORITHM =====
 def stitch_text(text_a: str, text_b: str, min_overlap_chars: int = 20) -> str:
     """Join two text chunks, dropping the lines of ``text_b`` that repeat the end of ``text_a``.

     The last *k* lines of ``text_a`` are compared with the first *k* lines of
     ``text_b`` for k running from ``min(len(a_lines), len(b_lines), 30)`` down
     to 1. Each side is joined with newlines and stripped before comparison.
     The first (i.e. largest) *k* whose stripped tail is at least
     ``min_overlap_chars`` characters long and equal to the stripped head wins.

     Returns:
         ``text_a`` + newline + the remaining lines of ``text_b`` when an
         overlap is found; otherwise both texts separated by a blank line.
         If either input is empty, the other one is returned unchanged.
     """
     if not text_a:
         return text_b
     if not text_b:
         return text_a
     head_lines = text_a.splitlines()
     tail_lines = text_b.splitlines()
     # Cap the comparison window at 30 lines so long texts stay cheap to scan.
     widest = min(len(head_lines), len(tail_lines), 30)
     for count in range(widest, 0, -1):
         ending = "\n".join(head_lines[-count:]).strip()
         if len(ending) < min_overlap_chars:
             # Too short to be trusted as a genuine overlap.
             continue
         if ending == "\n".join(tail_lines[:count]).strip():
             remainder = "\n".join(tail_lines[count:])
             return f"{text_a}\n{remainder}"
     # No overlap detected: keep both chunks, separated by a blank line.
     return f"{text_a}\n\n{text_b}"
 # ===== HELPER FUNCTIONS =====
 def clean_latex_formulas(text: str) -> str:
     """Trim whitespace just inside inline ``$...$`` math delimiters.

     ``"$ x^2 $"`` becomes ``"$x^2$"``. Delimiters are paired left to right and
     a formula may not span a line break, so the text *between* two formulas
     (``"$x$ and $y$"``) is never mistaken for a padded formula and squeezed.

     Args:
         text: Markdown text that may contain inline LaTeX formulas.

     Returns:
         The text with leading/trailing whitespace removed inside each
         ``$...$`` pair; everything else is returned unchanged.
     """
     def _tighten(match: "re.Match[str]") -> str:
         inner = match.group(1).strip()
         # A whitespace-only span is not a formula; leave it exactly as found.
         return f"${inner}$" if inner else match.group(0)
     # ``[^$\n]+`` keeps each match to one delimiter pair on one line.
     # ``(?!\d)`` skips a closing ``$`` that is directly followed by a digit,
     # so currency-like text such as "$5 and $10" is not treated as math.
     # NOTE(review): the original carried a placeholder comment about fixing
     # glued characters in Vietnamese OCR output; no such fix-up exists yet.
     return re.sub(r'\$([^$\n]+)\$(?!\d)', _tighten, text)
 def hash_password(password: str) -> str:
     """Return the hex-encoded SHA-256 digest of ``password`` (UTF-8 encoded).

     NOTE(review): this is an unsalted, fast hash. The scheme is kept as-is
     because changing it would presumably invalidate already stored hashes;
     consider migrating to a salted KDF.
     """
     return hashlib.sha256(password.encode()).hexdigest()
 def verify_password(password: str, hashed: str) -> bool:
     """Check ``password`` against a digest produced by ``hash_password``.

     Args:
         password: Plain-text candidate password.
         hashed: Previously stored hex digest.

     Returns:
         True when the digests match, False otherwise (including when
         ``hashed`` is not a string, e.g. a missing value).
     """
     import hmac
     if not isinstance(hashed, str):
         return False
     # Constant-time comparison avoids leaking how many leading characters
     # matched. Compare bytes because compare_digest rejects non-ASCII str.
     return hmac.compare_digest(hash_password(password).encode(), hashed.encode())
+def safe_get_text(response) -> str:
+    """
+    Trích xuất text an toàn từ Gemini Response.
+    Xử lý trường hợp bị chặn bản quyền (finish_reason=4).
+    """
+    if not response.candidates:
+        return ""
+    candidate = response.candidates[0]
+    # Kiểm tra lý do kết thúc
+    # 1 = STOP (OK), 4 = RECITING_FROM_COPYRIGHTED_MATERIAL (Blocked)
+    if candidate.finish_reason == 4:
+        return "[BLOCKED_BY_COPYRIGHT]"
+    if candidate.finish_reason != 1:
+        # Có thể log warning ở đây với các reason khác (như SAFETY)
+        return ""
+    # Nếu an toàn, trích xuất text
+    parts = candidate.content.parts
+    texts = [p.text for p in parts if hasattr(p, "text")]
+    return "\n".join(texts)
+async def fallback_ocr_tesseract(image: Image.Image) -> str:
+    """
+    Fallback dùng Tesseract OCR khi Gemini từ chối phục vụ
+    """
+    if pytesseract is None:
+        return "**[Lỗi] Gemini từ chối xử lý (Bản quyền) và Tesseract chưa được cài đặt.**"
+    print("[Fallback] Đang chạy Tesseract OCR...")
+    try:
+        # Chạy trong executor để không block event loop
+        loop = asyncio.get_running_loop()
+        # Chế độ: tiếng Việt + tiếng Anh + công thức toán (nếu train, ở đây dùng cơ bản)
+        text = await loop.run_in_executor(None, lambda: pytesseract.image_to_string(image, lang='vie+eng'))
+        return f"**[Lưu ý: Nội dung này được trích xuất bằng OCR dự phòng do Gemini chặn bản quyền]**\n\n{text}"
+    except Exception as e:
+        print(f"Tesseract Error: {e}")
+        return "**[Lỗi] Cả Gemini và Tesseract đều thất bại.**"
 # ===== API ENDPOINTS =====
 @app.get("/")
 @app.get("/health")
 async def root():
     """Health-check endpoint served at both ``/`` and ``/health``.

     Returns:
         A dict with the service status and name, the number of loaded API
         keys, the pandoc version (or ``"Not Found"``) and whether the
         ``pytesseract`` module was imported (``"Ready"`` / ``"Not Found"``).
     """
     try:
         pandoc_status = pypandoc.get_pandoc_version()
     except Exception:
         # Best effort: a missing pandoc must not fail the health check, but
         # KeyboardInterrupt/SystemExit should still propagate.
         pandoc_status = "Not Found"
     # NOTE(review): this only reflects that the Python module was imported;
     # it does not verify that the tesseract binary itself is installed.
     tesseract_status = "Ready" if pytesseract else "Not Found"
     return {
         "status": "ok",
         "service": "HT_MATH_WEB API v9.0 (Safe Mode)",
         "keys_loaded": key_manager.get_key_count(),
         "pandoc": pandoc_status,
         "tesseract": tesseract_status,
     }
 @app.get("/api/models")
 # --- CORE CONVERT LOGIC ---
 async def process_image_with_gemini(image: Image.Image, model_id: str, prompt: str, max_retries: int = 3) -> str:
     """Send one image (or image chunk) to Gemini and return the extracted text.

     Each attempt takes the next API key from ``key_manager``. A response
     flagged as copyright-blocked is handed to the Tesseract fallback at once.
     An error whose message contains "429" is retried after a 2-second pause.
     Any error on the last attempt also goes to the Tesseract fallback.

     Args:
         image: The image to transcribe.
         model_id: Gemini model name.
         prompt: Instruction text sent together with the image.
         max_retries: Maximum number of attempts.

     Returns:
         The stripped text from Gemini, the fallback OCR output, or an empty
         string when every attempt produced no text without raising.
     """
     for attempt in range(max_retries):
         is_last_attempt = attempt == max_retries - 1
         try:
             api_key = key_manager.get_next_key()
             if not api_key:
                 raise ValueError("No API Key")
             genai.configure(api_key=api_key)
             generation_config = {"temperature": 0.2, "top_p": 1.0, "max_output_tokens": 8192}
             model = genai.GenerativeModel(model_id, generation_config=generation_config)
             # NOTE(review): called without await, so this presumably blocks the
             # event loop for the duration of the request - confirm and consider
             # an async/threaded call.
             response = model.generate_content([prompt, image])
             text = safe_get_text(response)
             # Copyright block -> switch to fallback OCR.
             if text == "[BLOCKED_BY_COPYRIGHT]":
                 print("Warning: Gemini blocked copyright content. Switching to fallback OCR...")
                 return await fallback_ocr_tesseract(image)
             if text:
                 return text.strip()
         except Exception as e:
             if "429" in str(e) and not is_last_attempt:
                 # Non-blocking back-off: time.sleep() here would freeze every
                 # other coroutine on the event loop for two seconds.
                 await asyncio.sleep(2)
                 continue
             if is_last_attempt:
                 print(f"Error Gemini: {e}")
                 # Repeated network/quota failures also fall back to local OCR.
                 return await fallback_ocr_tesseract(image)
     return ""
 async def process_large_image(image: Image.Image, model: str, prompt: str, semaphore: asyncio.Semaphore) -> str:
     """
+    Xử lý ảnh lớn: Cắt -> Gửi (kèm fallback) -> Ghép
     """
+    CHUNK_HEIGHT = 2048
+    OVERLAP_HEIGHT = 512
     width, height = image.size
     if height <= CHUNK_HEIGHT:
         async with semaphore:
             return await process_image_with_gemini(image, model, prompt)
+    # Cắt ảnh
     chunks = []
     y = 0
     while y < height:
         bottom = min(y + CHUNK_HEIGHT, height)
         box = (0, y, width, bottom)
         chunk = image.crop(box)
         chunks.append(chunk)
+        if bottom == height: break
         y += (CHUNK_HEIGHT - OVERLAP_HEIGHT)
+    print(f"[Split] Image height {height}px -> {len(chunks)} chunks.")
     async def process_chunk(chunk_img, index):
         async with semaphore:
             text = await process_image_with_gemini(chunk_img, model, prompt)
     tasks = [process_chunk(chunk, i) for i, chunk in enumerate(chunks)]
     chunk_results = await asyncio.gather(*tasks)
     chunk_results.sort(key=lambda x: x[0])
     ordered_texts = [text for _, text in chunk_results]
     final_text = ordered_texts[0]
     for i in range(1, len(ordered_texts)):
         final_text = stitch_text(final_text, ordered_texts[i], min_overlap_chars=20)
         file_content = await file.read()
         file_ext = os.path.splitext(file.filename)[1].lower()
         global_semaphore = asyncio.Semaphore(MAX_THREADS)
         results = []
         if file_ext == ".pdf":
             doc = fitz.open(stream=file_content, filetype="pdf")
             async def process_page_wrapper(page, idx):
                 pix = page.get_pixmap(dpi=300)
                 img = Image.open(io.BytesIO(pix.tobytes("png")))
                 text = await process_large_image(img, model, prompt, global_semaphore)
                 return idx, text
         elif file_ext in [".png", ".jpg", ".jpeg", ".bmp"]:
             img = Image.open(io.BytesIO(file_content))
             text = await process_large_image(img, model, prompt, global_semaphore)
             results.append(text)
         else:
         traceback.print_exc()
         raise HTTPException(status_code=500, detail=str(e))
 @app.post("/api/export-docx")
 async def export_docx(markdown_text: str = Form(...)):
     try:
         return FileResponse(
             output_filename,
             media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            filename="HT_MATH_OUTPUT.docx"
         )
     except Exception as e:
         import traceback