import base64
import json
import logging
import os
import re
import uuid
from io import BytesIO

import requests
import streamlit as st
from pdf2image import convert_from_bytes
| |
|
# Page chrome: browser-tab title, icon and full-width layout.
st.set_page_config(page_title="DotsOCR", page_icon="🔍", layout="wide")

# Seed per-session state. Keys that already exist (from an earlier rerun of
# this script in the same session) are left untouched.
for _state_key, _initial_value in (
    ("total_pages", 0),
    ("pdf_images", []),
    ("ocr_results", {}),
    ("current_file_hash", None),
    ("task_id", None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
| |
|
| | |
# Base URL of the OCR proxy, read from the PROXY_URL environment variable.
# Surrounding whitespace and trailing slashes are removed so that
# f"{PROXY_BASE_URL}/path" never yields a double slash. Empty means
# "no proxy configured".
PROXY_BASE_URL = os.getenv("PROXY_URL", "").strip().rstrip("/")

# Model identifier sent in the "model" field of each chat-completions request.
MODEL_NAME = os.getenv("MODEL_NAME", "/model/DotsOCR")
| |
|
| |
|
def upload_pdf_to_proxy(task_id, file_bytes, filename):
    """Best-effort upload of the source PDF to the proxy's ``/upload_pdf`` endpoint.

    The upload is skipped when ``PROXY_BASE_URL`` is empty. Transport failures
    and non-success HTTP statuses are logged as warnings and never propagate,
    so a broken proxy cannot interrupt the caller.

    Args:
        task_id: Identifier sent in the ``task_id`` form field.
        file_bytes: Raw PDF content.
        filename: File name reported in the multipart ``file`` part.

    Returns:
        None.
    """
    if not PROXY_BASE_URL:
        return

    log = logging.getLogger(__name__)
    try:
        response = requests.post(
            f"{PROXY_BASE_URL}/upload_pdf",
            files={"file": (filename, file_bytes, "application/pdf")},
            data={"task_id": task_id},
            # NOTE(review): presumably bypasses ngrok's browser interstitial
            # when the proxy is tunnelled through ngrok - confirm.
            headers={"ngrok-skip-browser-warning": "true"},
            timeout=60,
        )
    except requests.RequestException as exc:
        # Deliberately non-fatal: record the failure instead of hiding it.
        log.warning("PDF upload to proxy failed for task %s: %s", task_id, exc)
        return

    if not response.ok:
        log.warning(
            "PDF upload to proxy returned HTTP %s for task %s",
            response.status_code,
            task_id,
        )
| |
|
| |
|
def image_to_base64(image):
    """Encode an image as a base64 string of its JPEG bytes.

    Args:
        image: A PIL-style image object exposing ``save(fp, format=...)`` and,
            for modes JPEG cannot store, ``mode`` and ``convert(mode)``.

    Returns:
        The JPEG-encoded image as a base64 text string.
    """
    # JPEG has no alpha channel or palette, so saving e.g. RGBA/LA/P images
    # fails. Convert anything outside the JPEG-writable modes to RGB first;
    # images already in a writable mode are saved exactly as before.
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if getattr(image, "mode", "RGB") not in jpeg_writable_modes:
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
| |
|
| |
|
def _extract_completion_text(res_json):
    """Return the first choice's message content from a chat-completions reply, or "" if absent."""
    if not isinstance(res_json, dict):
        return ""
    choices = res_json.get("choices")
    if not isinstance(choices, list) or not choices:
        return ""
    first_choice = choices[0]
    message = first_choice.get("message") if isinstance(first_choice, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    return "" if content is None else str(content)


def call_proxy_api(task_id, page_num, image):
    """Send one page image to the proxy's chat-completions endpoint for OCR.

    Args:
        task_id: Identifier forwarded in the ``X-Task-ID`` header.
        page_num: Page number forwarded (as text) in the ``X-Page-Num`` header.
        image: Page image; encoded via ``image_to_base64`` and embedded as a
            ``data:image/jpeg;base64,...`` URL.

    Returns:
        A ``(success, text)`` tuple. On success ``text`` is the message
        content of the first choice ("" when the reply carries none). On
        failure ``text`` is a message starting with ``"Proxy Error"`` (bad
        HTTP status or unparsable body) or ``"Connection Error"`` (request
        could not be completed or no proxy is configured).
    """
    if not PROXY_BASE_URL:
        # Same precondition upload_pdf_to_proxy checks; fail with a clear
        # message rather than an "Invalid URL" exception text.
        return False, "Connection Error: PROXY_URL is not configured"

    b64_img = image_to_base64(image)
    prompt = "Parse the content of this image into markdown text."

    payload = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{b64_img}"},
                    },
                ],
            }
        ],
        "max_tokens": 4096,
        "temperature": 0.1,
    }

    headers = {
        "X-Task-ID": task_id,
        "X-Page-Num": str(page_num),
        "ngrok-skip-browser-warning": "true",
    }

    try:
        response = requests.post(
            f"{PROXY_BASE_URL}/v1/chat/completions",
            json=payload,
            headers=headers,
            timeout=120,
        )
    except requests.RequestException as e:
        return False, f"Connection Error: {str(e)}"

    if response.status_code != 200:
        return False, f"Proxy Error {response.status_code}"

    try:
        res_json = response.json()
    except ValueError as e:
        # The proxy answered, but not with JSON: this is a proxy-side fault,
        # not a connectivity problem.
        return False, f"Proxy Error: invalid JSON response ({e})"

    # Tolerates an empty "choices" list and null "message"/"content" fields,
    # which would otherwise raise or leak None to the caller.
    return True, _extract_completion_text(res_json)
| |
|
| |
|
def download_pdf_from_url(url):
    """Download a document from ``url`` and return its raw bytes.

    Google Drive share links of the form ``.../file/d/<id>/...`` are rewritten
    to the direct-download form ``https://drive.google.com/uc?export=download&id=<id>``
    before fetching.

    Args:
        url: Address of the document to fetch.

    Returns:
        ``(content, None)`` on success, or ``(None, error_message)`` when the
        request fails, returns an HTTP error status, or ``url`` is unusable.
    """
    try:
        if "drive.google.com" in url and "/file/d/" in url:
            match = re.search(r"/file/d/([a-zA-Z0-9_-]+)", url)
            if match:
                url = f"https://drive.google.com/uc?export=download&id={match.group(1)}"

        request_headers = {"User-Agent": "Mozilla/5.0"}
        # With stream=True the connection is only released once the body is
        # consumed or the response is closed. The context manager guarantees
        # the close even when raise_for_status() aborts before the body is read.
        with requests.get(
            url, headers=request_headers, stream=True, timeout=30
        ) as resp:
            resp.raise_for_status()
            return resp.content, None
    except Exception as e:
        # Boundary for the UI: every failure is reported through the tuple.
        return None, str(e)
| |
|
| |
|
| | |
# ---------------------------------------------------------------------------
# Main UI script: choose a PDF (upload or URL), rasterise it, then OCR the
# page selected by the user and show image and text side by side.
# ---------------------------------------------------------------------------
st.title("🔍 DotsOCR Service")

# Page indexes whose entry in ocr_results is an error message rather than OCR
# text. Tracking failures explicitly (instead of searching the text for
# "Proxy Error"/"Connection Error") keeps documents that merely contain those
# words from being shown as failed.
if "ocr_failed_pages" not in st.session_state:
    st.session_state.ocr_failed_pages = set()

with st.sidebar:
    st.header("Input Source")
    input_type = st.radio("Select:", ["Upload PDF", "PDF URL"])
    pdf_bytes = None
    file_name = "document.pdf"

    if input_type == "Upload PDF":
        uploaded_file = st.file_uploader("Choose file", type=["pdf"])
        if uploaded_file:
            # getvalue() returns the full buffer regardless of the current
            # read position of the file-like object.
            pdf_bytes = uploaded_file.getvalue()
            file_name = uploaded_file.name
    else:
        url = st.text_input("URL:")
        if url and st.button("Fetch"):
            with st.spinner("Downloading..."):
                data, err = download_pdf_from_url(url)
            if data:
                pdf_bytes = data
            else:
                # Surface the reason instead of dropping it.
                st.error(f"Download failed: {err or 'empty response'}")

if pdf_bytes:
    file_hash = hash(pdf_bytes)

    # A different document: reset per-document state and start a new task.
    if st.session_state.current_file_hash != file_hash:
        st.session_state.ocr_results = {}
        st.session_state.ocr_failed_pages = set()
        st.session_state.current_file_hash = file_hash
        st.session_state.task_id = str(uuid.uuid4())

        upload_pdf_to_proxy(st.session_state.task_id, pdf_bytes, file_name)

        with st.spinner("Initializing Document..."):
            try:
                images = convert_from_bytes(pdf_bytes, dpi=150)
            except Exception:
                # Drop the previous document's pages so they are not shown
                # (and OCR'd) under the new task id on the next rerun.
                st.session_state.pdf_images = []
                st.session_state.total_pages = 0
                st.error("Failed to process PDF.")
                st.stop()
            st.session_state.pdf_images = images
            st.session_state.total_pages = len(images)

st.divider()

total_pages = st.session_state.total_pages
if total_pages > 0:
    if total_pages > 1:
        page_idx = st.slider("Page Navigation", 1, total_pages, 1) - 1
    else:
        # A one-page document needs no navigation; this also avoids building
        # a slider whose minimum equals its maximum.
        page_idx = 0
    current_image = st.session_state.pdf_images[page_idx]

    col1, col2 = st.columns([1, 1])

    with col1:
        st.image(current_image,
                 caption=f"Page {page_idx + 1}", use_container_width=True)

    # OCR the page on first visit, cache the outcome, then rerun so the
    # result column renders from the cache.
    if page_idx not in st.session_state.ocr_results:
        with st.spinner(f"⚡ Auto-processing Page {page_idx + 1}..."):
            success, res = call_proxy_api(
                st.session_state.task_id, page_idx + 1, current_image)

        # Always cache a string so the rendering below cannot hit None.
        st.session_state.ocr_results[page_idx] = (
            res if isinstance(res, str) else str(res or ""))
        if success:
            st.session_state.ocr_failed_pages.discard(page_idx)
        else:
            st.session_state.ocr_failed_pages.add(page_idx)

        st.rerun()

    with col2:
        if page_idx in st.session_state.ocr_results:
            res_txt = st.session_state.ocr_results[page_idx]

            if page_idx in st.session_state.ocr_failed_pages:
                st.error(f"❌ {res_txt}")
                if st.button("🔄 Retry Page"):
                    # Forget the cached failure so the next run retries it.
                    del st.session_state.ocr_results[page_idx]
                    st.session_state.ocr_failed_pages.discard(page_idx)
                    st.rerun()
            else:
                st.success("✅ Analysis Complete")
                st.markdown(res_txt)
                with st.expander("Show Raw Text"):
                    st.text_area("Raw", res_txt, height=200)
        else:
            st.info("Waiting for processor...")
| |
|