Spaces:
Sleeping
Sleeping
Commit
·
07ea4e8
1
Parent(s):
6761226
initialization
Browse files- .env +3 -0
- .gitignore +0 -0
- app.py +120 -0
- dockerfile +0 -0
- packages.txt +1 -0
- requirements.txt +9 -0
.env
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
API_KEY=123456
|
| 2 |
+
TYPHOON_OCR_API_KEY=<your-typhoon-ocr-api-key>
|
| 3 |
+
TYPHOON_BASE_URL=https://api.opentyphoon.ai/v1
|
.gitignore
ADDED
|
File without changes
|
app.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import tempfile
|
| 3 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException, Header
|
| 4 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 5 |
+
from typhoon_ocr import ocr_document
|
| 6 |
+
from pdf2image import convert_from_bytes
|
| 7 |
+
from PIL import Image
|
| 8 |
+
from docx import Document
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
import gradio as gr
|
| 11 |
+
|
| 12 |
+
# --- Environment configuration ---
# Copy variables from a local .env file (if one exists) into the process
# environment before they are read below.
load_dotenv()

# Value compared against the x_api_key header by the OCR endpoint; None when
# the variable is unset.
API_KEY = os.environ.get("API_KEY")
# Typhoon OCR settings. They are not referenced elsewhere in the visible code
# (presumably the typhoon_ocr package reads them from the environment itself
# -- TODO confirm).
TYPHOON_API_KEY = os.environ.get("TYPHOON_OCR_API_KEY")
TYPHOON_BASE_URL = os.environ.get(
    "TYPHOON_BASE_URL",
    "https://api.opentyphoon.ai/v1",
)

# --- FastAPI application ---
app = FastAPI()

# CORS is left fully open: every origin, method and header is allowed.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
|
| 27 |
+
|
| 28 |
+
# --- PDF to PIL images ---
|
| 29 |
+
def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
    """Render an in-memory PDF through pdf2image's ``convert_from_bytes``.

    Args:
        pdf_bytes: Raw contents of the PDF file.

    Returns:
        The result of ``convert_from_bytes(pdf_bytes)``, annotated here as a
        list of PIL images.
    """
    rendered_pages = convert_from_bytes(pdf_bytes)
    return rendered_pages
|
| 31 |
+
|
| 32 |
+
# --- Core OCR Handler ---
|
| 33 |
+
def process_ocr(file_path: str, task_type: str, page_num: int):
    """Run OCR on a PDF or image file and export the text as .md and .docx.

    Args:
        file_path: Path of the file to process. A name ending in ".pdf"
            (case-insensitive) is rasterized with ``pdf_to_images`` and
            OCR'd page by page; any other file is passed to
            ``ocr_document`` as a path.
        task_type: Forwarded to ``ocr_document`` as ``task_type``.
        page_num: 1-based page to process for PDFs; zero or a negative value
            means every page. Ignored for non-PDF input. Floats such as
            ``2.0`` are accepted and truncated to an integer.

    Returns:
        A ``(markdown_text, md_path, docx_path)`` tuple. Both paths point to
        temporary files created with ``delete=False``; removing them is left
        to the caller.

    Raises:
        ValueError: If ``page_num`` is greater than the number of PDF pages.
    """
    doc = Document()

    if file_path.lower().endswith(".pdf"):
        with open(file_path, "rb") as f:
            images = pdf_to_images(f.read())

        # A UI number field can deliver a float; a float cannot be used as a
        # list index, so normalize to int before selecting pages.
        page_num = int(page_num)
        if page_num > len(images):
            raise ValueError(
                f"page_num {page_num} is out of range; "
                f"the PDF has {len(images)} page(s)"
            )
        pages = range(len(images)) if page_num <= 0 else [page_num - 1]

        sections = []
        for i in pages:
            # NOTE(review): ocr_document receives a PIL image here but a file
            # path in the non-PDF branch -- confirm the library accepts both.
            result = ocr_document(images[i], task_type=task_type)
            text = result if isinstance(result, str) else result.get("text", "")
            text = text.strip()
            sections.append(f"\n## หน้า {i + 1}\n{text}\n")
            doc.add_heading(f"หน้า {i + 1}", level=1)
            doc.add_paragraph(text)
        md_result = "".join(sections)
    else:
        result = ocr_document(file_path, task_type=task_type)
        text = result if isinstance(result, str) else result.get("text", "")
        md_result = text.strip()
        doc.add_paragraph(md_result)

    # Save .md
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".md", mode="w", encoding="utf-8"
    ) as md_file:
        md_file.write(md_result)

    # Save .docx: reserve a unique name, close the handle, then let
    # python-docx write to that path so no file descriptor is left open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as docx_file:
        docx_path = docx_file.name
    doc.save(docx_path)

    return md_result, md_file.name, docx_path
|
| 66 |
+
|
| 67 |
+
# --- API Endpoint ---
|
| 68 |
+
@app.post("/api/ocr_document")
async def ocr_endpoint(
    file: UploadFile = File(...),
    task_type: str = "default",
    page_num: int = -1,
    x_api_key: str | None = Header(None)
):
    """OCR an uploaded file and return the extracted markdown.

    Args:
        file: The uploaded document; its filename extension is kept on the
            temporary copy handed to ``process_ocr``.
        task_type: Forwarded to ``process_ocr``.
        page_num: Forwarded to ``process_ocr``.
        x_api_key: Header value that must equal ``API_KEY`` whenever
            ``API_KEY`` is set.

    Returns:
        A dict with the keys ``markdown``, ``md_file``, ``docx_file`` and
        ``task_type``.

    Raises:
        HTTPException: 401 when the API key does not match; 500 (with the
            error text as detail) when saving or processing the file fails.
    """
    # Local import keeps this block self-contained; asyncio is stdlib.
    import asyncio

    if API_KEY and x_api_key != API_KEY:
        raise HTTPException(status_code=401, detail="API key ผิดพ่อง")

    tmp_path = None
    try:
        # filename may be missing on an upload; fall back to no suffix
        # instead of crashing in os.path.splitext(None).
        suffix = os.path.splitext(file.filename or "")[-1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(await file.read())
            tmp_path = tmp.name

        # process_ocr is blocking; run it in a worker thread so the event
        # loop can keep serving other requests meanwhile.
        md, md_path, docx_path = await asyncio.to_thread(
            process_ocr, tmp_path, task_type, page_num
        )

        return {
            "markdown": md,
            "md_file": md_path,
            "docx_file": docx_path,
            "task_type": task_type,
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # The upload copy was created with delete=False, so remove it here.
        # Cleanup is best-effort: a failed delete must not mask the response.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
|
| 94 |
+
|
| 95 |
+
# --- Gradio UI ---
|
| 96 |
+
def gradio_handler(file, task_type, page_num):
    """Gradio click callback: OCR the uploaded file and return UI outputs.

    Args:
        file: Uploaded file object exposing a ``name`` path attribute, or
            None when nothing was uploaded.
        task_type: Forwarded to ``process_ocr``.
        page_num: Page selector from the number field. It may arrive as a
            float, and possibly as None when the field is empty; it is
            converted to int (None becomes -1) before being forwarded.

    Returns:
        ``(markdown_or_message, md_path, docx_path)``. On a missing upload or
        any processing error the first item is a message and both paths are
        None.
    """
    if file is None:
        return "❌ อัปโหลดไฟล์ก่อน ไอ้สัส", None, None
    try:
        # process_ocr uses the page number as a list index, which must be an
        # int. The conversion sits inside the try so bad input is reported
        # in the UI like any other error.
        page = -1 if page_num is None else int(page_num)
        md, md_path, docx_path = process_ocr(file.name, task_type, page)
        return md, md_path, docx_path
    except Exception as e:
        return f"⚠️ Error: {str(e)}", None, None
|
| 104 |
+
|
| 105 |
+
# Gradio front end: upload a file, choose page and OCR mode, then show the
# markdown and offer the generated .md / .docx files for download.
# NOTE(review): the scraped source lost its indentation; the Row nesting below
# is reconstructed from convention -- confirm against the original layout.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Typhoon OCR (ทุกหน้า) | Text Extractor | PDF / JPG")
    with gr.Row():
        file_input = gr.File(label="📤 อัปโหลด PDF หรือ รูป", file_types=[".pdf", ".jpg", ".jpeg", ".png"])
        # precision=0 makes the component hand an int to the callback; the
        # value is used downstream as a page index.
        page_input = gr.Number(value=-1, precision=0, label="📄 เลขหน้า (ใส่ -1 เพื่อแปลงทุกหน้า)")
        task_type = gr.Radio(["default", "structure"], label="📌 OCR Mode", value="default")
    ocr_output = gr.Textbox(label="📋 ผล Markdown", lines=20)
    with gr.Row():
        btn = gr.Button("🚀 OCR Now!")
        md_out = gr.File(label="📥 .md ไฟล์")
        docx_out = gr.File(label="📥 .docx ไฟล์")
    btn.click(fn=gradio_handler, inputs=[file_input, task_type, page_input], outputs=[ocr_output, md_out, docx_out])

# Launch only when run as a script. An unconditional launch() blocks at
# import time, which would stop an ASGI server importing this module from
# ever serving the FastAPI routes defined on `app`.
if __name__ == "__main__":
    demo.launch()
# Alternatively, to mount the UI on a FastAPI route:
# app = gr.mount_gradio_app(app, demo, path="/ui")
|
dockerfile
ADDED
|
File without changes
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
poppler-utils
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
gradio
|
| 3 |
+
python-multipart
|
| 4 |
+
python-docx
|
| 5 |
+
typhoon-ocr
|
| 6 |
+
pdf2image
|
| 7 |
+
Pillow
|
| 8 |
+
python-dotenv
|
| 9 |
+
uvicorn
|