document-extraction

Sleeping

App Files Community

kmuthudurai committed on Dec 11, 2024

Commit

dca3ec3

verified ·

1 Parent(s): d3f944d

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -69

app.py CHANGED Viewed

@@ -2,12 +2,17 @@ import uvicorn
 from fastapi.staticfiles import StaticFiles
 import hashlib
 from enum import Enum
-from fastapi import FastAPI, UploadFile, File
 from paddleocr import PaddleOCR, PPStructure, save_structure_res
 from PIL import Image
 import io
 import numpy as np
 import fitz  # PyMuPDF for PDF handling
 app = FastAPI(docs_url='/')
 use_gpu = False
@@ -17,10 +22,10 @@ class LangEnum(str, Enum):
     ch = "ch"
     en = "en"
-# cache with ocr
 ocr_cache = {}
-# get ocr instance
 def get_ocr(lang, use_gpu=False):
     if not ocr_cache.get(lang):
         ocr_cache[lang] = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=use_gpu)
@@ -29,85 +34,54 @@ def get_ocr(lang, use_gpu=False):
 # Function to extract images from PDF
 def pdf_to_images(file_contents):
-    doc = fitz.open(io.BytesIO(file_contents))
-    images = []
-    for page in doc:
-        pix = page.get_pixmap()
-        img = Image.open(io.BytesIO(pix.tobytes("png")))
-        images.append(img)
-    return images
 @app.post("/ocr")
 async def create_upload_file(
     file: UploadFile = File(...),
     lang: LangEnum = LangEnum.ch,
 ):
-    contents = await file.read()
-    # Determine if the uploaded file is a PDF or image
-    if file.content_type == "application/pdf":
-        images = pdf_to_images(contents)
-    else:
-        # If it's an image file
-        images = [Image.open(io.BytesIO(contents))]
-    ocr = get_ocr(lang=lang, use_gpu=use_gpu)
-    final_results = []
-    for image in images:
-        img2np = np.array(image)
-        result = ocr.ocr(img2np, cls=True)[0]
-        boxes = [line[0] for line in result]
-        txts = [line[1][0] for line in result]
-        scores = [line[1][1] for line in result]
-        # 识别结果
-        final_result = [dict(boxes=box, txt=txt, score=score) for box, txt, score in zip(boxes, txts, scores)]
-        final_results.extend(final_result)
-    return final_results
-@app.post("/ocr_table")
-async def create_upload_file_for_table(
-    file: UploadFile = File(...),
-    lang: LangEnum = LangEnum.ch,
-):
-    table_engine = PPStructure(show_log=True, table=True, lang=lang)
-    contents = await file.read()
-    # 计算文件内容的哈希值
-    file_hash = hashlib.sha256(contents).hexdigest()
-    # Determine if the uploaded file is a PDF or image
-    if file.content_type == "application/pdf":
-        images = pdf_to_images(contents)
-    else:
-        images = [Image.open(io.BytesIO(contents))]
-    final_htmls = []
-    final_bboxes = []
-    final_types = []
-    for image in images:
-        img2np = np.array(image)
-        result = table_engine(img2np)
-        save_structure_res(result, output_dir, f'{file_hash}')
-        for item in result:
-            item_res = item.get('res', {})
-            final_htmls.append(item_res.get('html', ''))
-            final_types.append(item.get('type', ''))
-            final_bboxes.append(item.get('bbox', ''))
-    return {
-        'htmls': final_htmls,
-        'hash': file_hash,
-        'bboxes': final_bboxes,
-        'types': final_types,
-    }
 # Serve the output folder as static files
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")

 from fastapi.staticfiles import StaticFiles
 import hashlib
 from enum import Enum
+from fastapi import FastAPI, UploadFile, File, HTTPException
 from paddleocr import PaddleOCR, PPStructure, save_structure_res
 from PIL import Image
 import io
 import numpy as np
 import fitz  # PyMuPDF for PDF handling
+import logging
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 app = FastAPI(docs_url='/')
 use_gpu = False
     ch = "ch"
     en = "en"
+# Cache with ocr
 ocr_cache = {}
+# Get OCR instance
 def get_ocr(lang, use_gpu=False):
     if not ocr_cache.get(lang):
         ocr_cache[lang] = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=use_gpu)
 # Function to extract images from PDF
 def pdf_to_images(file_contents):
+    try:
+        doc = fitz.open(io.BytesIO(file_contents))
+        images = []
+        for page in doc:
+            pix = page.get_pixmap()
+            img = Image.open(io.BytesIO(pix.tobytes("png")))
+            images.append(img)
+        return images
+    except Exception as e:
+        logger.error(f"Error processing PDF: {str(e)}")
+        raise HTTPException(status_code=500, detail="Error processing PDF file")
 @app.post("/ocr")
 async def create_upload_file(
     file: UploadFile = File(...),
     lang: LangEnum = LangEnum.ch,
 ):
+    try:
+        contents = await file.read()
+        # Determine if the uploaded file is a PDF or image
+        if file.content_type == "application/pdf":
+            images = pdf_to_images(contents)
+        else:
+            # If it's an image file
+            images = [Image.open(io.BytesIO(contents))]
+        ocr = get_ocr(lang=lang, use_gpu=use_gpu)
+        final_results = []
+        for image in images:
+            img2np = np.array(image)
+            result = ocr.ocr(img2np, cls=True)[0]
+            boxes = [line[0] for line in result]
+            txts = [line[1][0] for line in result]
+            scores = [line[1][1] for line in result]
+            # 识别结果
+            final_result = [dict(boxes=box, txt=txt, score=score) for box, txt, score in zip(boxes, txts, scores)]
+            final_results.extend(final_result)
+        return final_results
+    except Exception as e:
+        logger.error(f"Error processing file: {str(e)}")
+        raise HTTPException(status_code=500, detail="Internal server error while processing the file")
 # Serve the output folder as static files
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")