Spaces:

ZennyKenny
/

Novoyaz

Sleeping

App Files Files Community

ZennyKenny committed on Aug 12

Commit

0f28e05

verified ·

1 Parent(s): bb8ebf0

Update app.py

Browse files

Files changed (1) hide show

app.py +244 -94

app.py CHANGED Viewed

@@ -1,109 +1,259 @@
-# app.py
-import spaces  # must be first
 import traceback
 from io import BytesIO
-from typing import Tuple
 import gradio as gr
 import requests
 import torch
 from huggingface_hub import snapshot_download
-from PIL import Image
 from qwen_vl_utils import process_vision_info
-from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
-# --- Config ---
-OCR_REPO = "rednote-hilab/dots.ocr"
-OCR_LOCAL = "./models/dots-ocr-local"
-CONVERT_REPO = "ZennyKenny/oss-20b-prereform-to-modern-ru-merged"
-SYSTEM_MSG = (
-    "You convert Russian text from pre-1918 orthography to modern Russian spelling. "
-    "Keep wording and punctuation; change only orthography."
-)
-OCR_PROMPT = (
-    "Extract the original text from this image as plain text. "
-    "Keep the reading order. Do not translate. Do not add extra formatting."
-)
-# --- Snapshot OCR locally (same technique as the working Space) ---
-snapshot_download(repo_id=OCR_REPO, local_dir=OCR_LOCAL, local_dir_use_symlinks=False)
-# --- Load models at module scope (after spaces import) ---
-# Expecting flash-attn to be available & ABI-compatible now
-ocr_model = AutoModelForCausalLM.from_pretrained(
-    OCR_LOCAL,
-    attn_implementation="flash_attention_2",
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else "auto",
-    device_map="auto",
-    trust_remote_code=True,
-)
-ocr_processor = AutoProcessor.from_pretrained(OCR_LOCAL, trust_remote_code=True)
-tok = AutoTokenizer.from_pretrained(CONVERT_REPO, use_fast=True)
-conv_model = AutoModelForCausalLM.from_pretrained(
-    CONVERT_REPO,
-    device_map="auto",
-    torch_dtype="auto",
-)
 device = "cuda" if torch.cuda.is_available() else "cpu"
-def fetch_image(x) -> Image.Image:
-    if isinstance(x, Image.Image):
-        return x.convert("RGB")
-    if isinstance(x, str):
-        if x.startswith(("http://", "https://")):
-            r = requests.get(x, timeout=30); r.raise_for_status()
-            return Image.open(BytesIO(r.content)).convert("RGB")
-        return Image.open(x).convert("RGB")
-    raise ValueError(f"Unsupported input: {type(x)}")
-def run_ocr(img: Image.Image) -> str:
-    messages = [{"role":"user","content":[{"type":"image","image":img},{"type":"text","text":OCR_PROMPT}]}]
-    text = ocr_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs = process_vision_info(messages)
-    inputs = ocr_processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(device)
-    with torch.no_grad():
-        out = ocr_model.generate(**inputs, max_new_tokens=4096, do_sample=False, temperature=0.0)
-    trimmed = [o[len(i):] for i, o in zip(inputs["input_ids"], out)]
-    s = ocr_processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    return s.strip()
-def convert_pre_to_modern(txt: str) -> str:
-    messages = [{"role":"system","content":SYSTEM_MSG},{"role":"user","content":txt}]
-    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = tok([prompt], return_tensors="pt").to(conv_model.device)
     with torch.no_grad():
-        gen = conv_model.generate(**inputs, max_new_tokens=1024, do_sample=False, temperature=0.0, repetition_penalty=1.05)
-    gen_only = gen[0][inputs["input_ids"].shape[1]:]
-    return tok.decode(gen_only, skip_special_tokens=True).strip()
-@spaces.GPU()
-def transcribe_and_convert(image_in) -> Tuple[Image.Image, str, str, str]:
     try:
-        img = fetch_image(image_in)
-        ocr_text = run_ocr(img)
-        modern = convert_pre_to_modern(ocr_text)
-        md = f"```text\n{modern}\n```"
-        return img, ocr_text, modern, md
-    except Exception as e:
-        traceback.print_exc()
-        return None, "", "", f"Error: {e}"
-with gr.Blocks(title="Pre-reform → Modern Russian (OCR + Conversion)") as demo:
-    gr.Markdown("Upload an image with pre-1918 Russian → OCR (dots.ocr) → convert to modern Russian.")
-    with gr.Row():
-        with gr.Column(scale=1):
-            img_in = gr.Image(type="pil", label="Upload image")
-            btn = gr.Button("Transcribe & Convert", variant="primary")
-        with gr.Column(scale=2):
-            with gr.Row():
-                img_out = gr.Image(label="Preview", interactive=False)
-                ocr_box = gr.Textbox(label="Transcribed (pre-reform)", lines=14)
-                modern_box = gr.Textbox(label="Modern Russian", lines=14)
-            md_box = gr.Markdown(label="Markdown block")
-    btn.click(transcribe_and_convert, [img_in], [img_out, ocr_box, modern_box, md_box], api_name="transcribe_convert")
-demo.queue(max_size=10).launch(server_name="0.0.0.0", server_port=7860, debug=True, show_error=True)

+import spaces
+import json
+import math
+import os
 import traceback
 from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
+import re
+import fitz  # PyMuPDF
 import gradio as gr
 import requests
 import torch
 from huggingface_hub import snapshot_download
+from PIL import Image, ImageDraw, ImageFont
 from qwen_vl_utils import process_vision_info
+from transformers import AutoModelForCausalLM, AutoProcessor
+# Constants
+MIN_PIXELS = 3136
+MAX_PIXELS = 11289600
+IMAGE_FACTOR = 28
+# Prompts
+prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
+1. Bbox format: [x1, y1, x2, y2]
+2. Layout Categories: ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
+3. Text Extraction & Formatting Rules:
+    - Picture: Omit text.
+    - Formula: Format as LaTeX.
+    - Table: Format as HTML.
+    - Others: Format as Markdown.
+4. Output must be the original text with no translation, sorted in human reading order.
+5. Final output: single JSON object.
+"""
+# Utility functions
+def round_by_factor(number: int, factor: int) -> int:
+    return round(number / factor) * factor
+def smart_resize(height: int, width: int, factor: int = 28,
+                 min_pixels: int = 3136, max_pixels: int = 11289600):
+    if max(height, width) / min(height, width) > 200:
+        raise ValueError("absolute aspect ratio must be smaller than 200")
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = round_by_factor(height / beta, factor)
+        w_bar = round_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = round_by_factor(height * beta, factor)
+        w_bar = round_by_factor(width * beta, factor)
+    return h_bar, w_bar
+def fetch_image(image_input, min_pixels=None, max_pixels=None):
+    if isinstance(image_input, str):
+        if image_input.startswith(("http://", "https://")):
+            response = requests.get(image_input)
+            image = Image.open(BytesIO(response.content)).convert('RGB')
+        else:
+            image = Image.open(image_input).convert('RGB')
+    elif isinstance(image_input, Image.Image):
+        image = image_input.convert('RGB')
+    else:
+        raise ValueError(f"Invalid image input type: {type(image_input)}")
+    if min_pixels is not None or max_pixels is not None:
+        min_pixels = min_pixels or MIN_PIXELS
+        max_pixels = max_pixels or MAX_PIXELS
+        height, width = smart_resize(image.height, image.width, factor=IMAGE_FACTOR,
+                                     min_pixels=min_pixels, max_pixels=max_pixels)
+        image = image.resize((width, height), Image.LANCZOS)
+    return image
+def load_images_from_pdf(pdf_path: str) -> List[Image.Image]:
+    images = []
+    try:
+        pdf_document = fitz.open(pdf_path)
+        for page_num in range(len(pdf_document)):
+            page = pdf_document.load_page(page_num)
+            mat = fitz.Matrix(2.0, 2.0)
+            pix = page.get_pixmap(matrix=mat)
+            img_data = pix.tobytes("ppm")
+            image = Image.open(BytesIO(img_data)).convert('RGB')
+            images.append(image)
+        pdf_document.close()
+    except Exception as e:
+        print(f"Error loading PDF: {e}")
+    return images
+def is_arabic_text(text: str) -> bool:
+    if not text:
+        return False
+    header_pattern = r'^#{1,6}\s+(.+)$'
+    paragraph_pattern = r'^(?!#{1,6}\s|!\[|```|\||\s*[-*+]\s|\s*\d+\.\s)(.+)$'
+    content_text = []
+    for line in text.split('\n'):
+        line = line.strip()
+        if not line:
+            continue
+        header_match = re.match(header_pattern, line, re.MULTILINE)
+        if header_match:
+            content_text.append(header_match.group(1))
+            continue
+        if re.match(paragraph_pattern, line, re.MULTILINE):
+            content_text.append(line)
+    if not content_text:
+        return False
+    combined_text = ' '.join(content_text)
+    arabic_chars = sum(1 for c in combined_text if '\u0600' <= c <= '\u06FF' or '\u0750' <= c <= '\u077F' or '\u08A0' <= c <= '\u08FF')
+    total_chars = sum(1 for c in combined_text if c.isalpha())
+    return total_chars > 0 and (arabic_chars / total_chars) > 0.5
+def layoutjson2md(image: Image.Image, layout_data: List[Dict], text_key='text') -> str:
+    import base64
+    markdown_lines = []
+    try:
+        sorted_items = sorted(layout_data, key=lambda x: (x.get('bbox', [0, 0, 0, 0])[1], x.get('bbox', [0, 0, 0, 0])[0]))
+        for item in sorted_items:
+            category = item.get('category', '')
+            text = item.get(text_key, '')
+            if category == 'Picture':
+                markdown_lines.append("![Image](Image detected)\n")
+            elif not text:
+                continue
+            elif category == 'Title':
+                markdown_lines.append(f"# {text}\n")
+            elif category == 'Section-header':
+                markdown_lines.append(f"## {text}\n")
+            elif category == 'Text':
+                markdown_lines.append(f"{text}\n")
+            elif category == 'List-item':
+                markdown_lines.append(f"- {text}\n")
+            elif category == 'Table':
+                markdown_lines.append(f"{text}\n")
+            elif category == 'Formula':
+                markdown_lines.append(f"$$\n{text}\n$$\n")
+            elif category == 'Caption':
+                markdown_lines.append(f"*{text}*\n")
+            elif category == 'Footnote':
+                markdown_lines.append(f"^{text}^\n")
+    except Exception as e:
+        print(f"Error converting to markdown: {e}")
+        return str(layout_data)
+    return "\n".join(markdown_lines)
+# Model
+model_id = "rednote-hilab/dots.ocr"
+model_path = "./models/dots-ocr-local"
+snapshot_download(repo_id=model_id, local_dir=model_path, local_dir_use_symlinks=False)
+model = AutoModelForCausalLM.from_pretrained(model_path, attn_implementation="flash_attention_2",
+                                             torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# State
+pdf_cache = {"images": [], "current_page": 0, "total_pages": 0, "file_type": None, "is_parsed": False, "results": []}
+@spaces.GPU()
+def inference(image: Image.Image, prompt: str, max_new_tokens=24000) -> str:
+    messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]}]
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(device)
     with torch.no_grad():
+        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, temperature=0.1)
+    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+    output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+    return output_text[0] if output_text else ""
+def process_image(image: Image.Image, min_pixels=None, max_pixels=None):
+    if min_pixels is not None or max_pixels is not None:
+        image = fetch_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
+    raw_output = inference(image, prompt)
     try:
+        layout_data = json.loads(raw_output)
+        return layoutjson2md(image, layout_data), layout_data
+    except json.JSONDecodeError:
+        return raw_output, None
+def load_file_for_preview(file_path: str):
+    global pdf_cache
+    if not file_path or not os.path.exists(file_path):
+        return None, "No file selected"
+    ext = os.path.splitext(file_path)[1].lower()
+    if ext == '.pdf':
+        images = load_images_from_pdf(file_path)
+        pdf_cache.update({"images": images, "current_page": 0, "total_pages": len(images),
+                          "file_type": "pdf", "is_parsed": False, "results": []})
+        return images[0], f"Page 1 / {len(images)}"
+    else:
+        img = Image.open(file_path).convert('RGB')
+        pdf_cache.update({"images": [img], "current_page": 0, "total_pages": 1,
+                          "file_type": "image", "is_parsed": False, "results": []})
+        return img, "Page 1 / 1"
+def turn_page(direction: str):
+    global pdf_cache
+    if not pdf_cache["images"]:
+        return None, '<div class="page-info">No file loaded</div>', "No results yet"
+    if direction == "prev":
+        pdf_cache["current_page"] = max(0, pdf_cache["current_page"] - 1)
+    elif direction == "next":
+        pdf_cache["current_page"] = min(pdf_cache["total_pages"] - 1, pdf_cache["current_page"] + 1)
+    idx = pdf_cache["current_page"]
+    img = pdf_cache["images"][idx]
+    page_info_html = f'<div class="page-info">Page {idx + 1} / {pdf_cache["total_pages"]}</div>'
+    markdown_content = "Page not processed yet"
+    if pdf_cache["is_parsed"] and idx < len(pdf_cache["results"]):
+        markdown_content = pdf_cache["results"][idx]
+    if is_arabic_text(markdown_content):
+        markdown_content = gr.update(value=markdown_content, rtl=True)
+    return img, page_info_html, markdown_content
+def create_gradio_interface():
+    css = ".page-info {text-align: center;padding: 8px 16px;border-radius: 20px;font-weight: bold;margin: 10px 0;}"
+    with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
+        gr.HTML("<h1 style='text-align:center'>🔍 Dot-OCR - Extracted Content Only</h1>")
+        with gr.Row():
+            with gr.Column(scale=1):
+                file_input = gr.File(label="Upload Image or PDF", file_types=[".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".pdf"], type="filepath")
+                image_preview = gr.Image(label="Preview", type="pil", interactive=False, height=300)
+                with gr.Row():
+                    prev_page_btn = gr.Button("◀ Previous")
+                    page_info = gr.HTML('<div class="page-info">No file loaded</div>')
+                    next_page_btn = gr.Button("Next ▶")
+                process_btn = gr.Button("🚀 Process Document", variant="primary")
+                clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
+            with gr.Column(scale=2):
+                markdown_output = gr.Markdown(value="Click 'Process Document' to see extracted content...", height=500)
+        file_input.change(load_file_for_preview, inputs=file_input, outputs=[image_preview, page_info])
+        prev_page_btn.click(lambda: turn_page("prev"), outputs=[image_preview, page_info, markdown_output])
+        next_page_btn.click(lambda: turn_page("next"), outputs=[image_preview, page_info, markdown_output])
+        process_btn.click(lambda f: _process_document(f), inputs=file_input, outputs=[markdown_output])
+        clear_btn.click(lambda: (None, None, '<div class="page-info">No file loaded</div>', "Click 'Process Document' to see extracted content..."),
+                        outputs=[file_input, image_preview, page_info, markdown_output])
+    return demo
+def _process_document(file_path):
+    global pdf_cache
+    if not file_path:
+        return "Please upload a file first."
+    img, _ = load_file_for_preview(file_path)
+    results = []
+    for page_img in pdf_cache["images"]:
+        md_content, _ = process_image(page_img)
+        results.append(md_content)
+    pdf_cache["results"] = results
+    pdf_cache["is_parsed"] = True
+    combined_md = "\n\n---\n\n".join(results)
+    if is_arabic_text(combined_md):
+        return gr.update(value=combined_md, rtl=True)
+    return combined_md
+if __name__ == "__main__":
+    demo = create_gradio_interface()
+    demo.queue(max_size=10).launch(server_name="0.0.0.0", server_port=7860)