document-extraction

Sleeping

App Files Files Community

kmuthudurai committed on Dec 11, 2024

Commit

39fc86b

verified ·

1 Parent(s): b43ecb8

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -30

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ from paddleocr import PaddleOCR, PPStructure, save_structure_res
 from PIL import Image
 import io
 import numpy as np
 # FastAPI application object; docs_url='/' is passed so the interactive docs page is served at the root path.
 app = FastAPI(docs_url='/')
 # Module-level GPU switch; the /ocr handler forwards it to get_ocr().
 use_gpu = False
@@ -19,40 +20,59 @@ class LangEnum(str, Enum):
 # Cache of PaddleOCR instances, keyed by language.
 ocr_cache = {}
-# Return the PaddleOCR instance for `lang`, creating and caching it on first use.
 # `use_gpu` is only consulted when a new instance is built; the cache key is `lang` alone.
 def get_ocr(lang, use_gpu=False):
     if not ocr_cache.get(lang):
         ocr_cache[lang] = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=use_gpu)
     return ocr_cache.get(lang)
 @app.post("/ocr")
 async def create_upload_file(
     file: UploadFile = File(...),
     lang: LangEnum = LangEnum.ch,
-    # use_gpu: bool = False
 ):
     # POST /ocr: read the uploaded file and run OCR on it with the engine cached for `lang`.
     contents = await file.read()
-    # Decode the uploaded bytes as a single image.
-    image = Image.open(io.BytesIO(contents))
     ocr = get_ocr(lang=lang, use_gpu=use_gpu)
-    img2np = np.array(image)
-    # Only the first element of the OCR output is used.
-    result = ocr.ocr(img2np, cls=True)[0]
-    # Each entry is read as line[0] -> box, line[1][0] -> text, line[1][1] -> score.
-    boxes = [line[0] for line in result]
-    txts = [line[1][0] for line in result]
-    scores = [line[1][1] for line in result]
-    # Recognition results
-    final_result = [dict(boxes=box, txt=txt, score=score) for box, txt, score in zip(boxes, txts, scores)]
-    return final_result
 @app.post("/ocr_table")
-async def create_upload_file(
     file: UploadFile = File(...),
     lang: LangEnum = LangEnum.ch,
-    # use_gpu: bool = False
 ):
     table_engine = PPStructure(show_log=True, table=True, lang=lang)
@@ -60,30 +80,37 @@ async def create_upload_file(
     # 计算文件内容的哈希值
     file_hash = hashlib.sha256(contents).hexdigest()
-    image = Image.open(io.BytesIO(contents))
-    img2np = np.array(image)
-    result = table_engine(img2np)
-    save_structure_res(result, output_dir, f'{file_hash}')
-    htmls = []
-    types = []
-    bboxes = []
-    for item in result:
-        item_res = item.get('res', {})
-        htmls.append(item_res.get('html', ''))
-        types.append(item.get('type', ''))
-        bboxes.append(item.get('bbox', ''))
     return {
-        'htmls': htmls,
         'hash': file_hash,
-        'bboxes': bboxes,
-        'types': types,
     }
 # Serve the "output" directory as static files under the /output path.
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
 # Start a uvicorn server when this module is executed as a script.
 if __name__ == '__main__':
-    uvicorn.run(app=app)

 from PIL import Image
 import io
 import numpy as np
+import fitz  # PyMuPDF for PDF handling
 # FastAPI application object; docs_url='/' is passed so the interactive docs page is served at the root path.
 app = FastAPI(docs_url='/')
 # Module-level GPU switch; the /ocr handler forwards it to get_ocr().
 use_gpu = False
 # Cache of PaddleOCR instances, keyed by language.
 ocr_cache = {}
+# Return the PaddleOCR instance for `lang`, creating and caching it on first use.
+# `use_gpu` is only consulted when a new instance is built; the cache key is `lang` alone.
 def get_ocr(lang, use_gpu=False):
     if not ocr_cache.get(lang):
         ocr_cache[lang] = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=use_gpu)
     return ocr_cache.get(lang)
+# Function to extract images from PDF
+def pdf_to_images(file_contents):
+    """Render every page of an in-memory PDF to a PIL image.
+
+    Args:
+        file_contents: Raw bytes of the PDF document.
+
+    Returns:
+        A list with one PIL image per page, in page order.
+    """
+    images = []
+    # In-memory data must be passed through `stream=` together with an
+    # explicit file type; the first positional argument of fitz.open() is
+    # a file path. The `with` block closes the document once all pages
+    # have been rendered.
+    with fitz.open(stream=file_contents, filetype="pdf") as doc:
+        for page in doc:
+            pix = page.get_pixmap()
+            # The PNG bytes are an independent copy, so the resulting image
+            # stays usable after the document is closed.
+            img = Image.open(io.BytesIO(pix.tobytes("png")))
+            images.append(img)
+    return images
 @app.post("/ocr")
 async def create_upload_file(
     file: UploadFile = File(...),
     lang: LangEnum = LangEnum.ch,
 ):
+    """Run OCR on an uploaded image or PDF.
+
+    A PDF upload is rendered page by page; any other upload is opened as a
+    single image. The recognised lines of all pages are returned as one flat
+    list of ``{"boxes": ..., "txt": ..., "score": ...}`` dicts.
+    """
     contents = await file.read()
+    # Determine if the uploaded file is a PDF or image
+    if file.content_type == "application/pdf":
+        images = pdf_to_images(contents)
+    else:
+        # If it's an image file
+        images = [Image.open(io.BytesIO(contents))]
     ocr = get_ocr(lang=lang, use_gpu=use_gpu)
+    final_results = []
+    for image in images:
+        img2np = np.array(image)
+        # Some PaddleOCR releases yield None instead of an empty list for a
+        # page with no detected text; treat that as "no lines" rather than
+        # failing on iteration.
+        result = ocr.ocr(img2np, cls=True)[0] or []
+        # Recognition results: line[0] is the box, line[1] is (text, score).
+        final_results.extend(
+            dict(boxes=line[0], txt=line[1][0], score=line[1][1])
+            for line in result
+        )
+    return final_results
 @app.post("/ocr_table")
+async def create_upload_file_for_table(
     file: UploadFile = File(...),
     lang: LangEnum = LangEnum.ch,
 ):
     # POST /ocr_table: run the PPStructure table engine on an uploaded image or PDF.
     # A new PPStructure engine is constructed on every call.
     table_engine = PPStructure(show_log=True, table=True, lang=lang)
     # NOTE(review): `contents` and `output_dir` are not assigned in the lines shown here;
     # presumably they are set in lines this diff view omits - confirm against the full file.
     # Compute the hash of the file contents
     file_hash = hashlib.sha256(contents).hexdigest()
+    # Determine if the uploaded file is a PDF or image
+    if file.content_type == "application/pdf":
+        images = pdf_to_images(contents)
+    else:
+        images = [Image.open(io.BytesIO(contents))]
+    # Results of all pages are accumulated into these three parallel lists.
+    final_htmls = []
+    final_bboxes = []
+    final_types = []
+    for image in images:
+        img2np = np.array(image)
+        result = table_engine(img2np)
+        # NOTE(review): every page is saved under the same name (file_hash);
+        # check whether later pages overwrite the output of earlier ones.
+        save_structure_res(result, output_dir, f'{file_hash}')
+        for item in result:
+            # Missing keys fall back to '' (or {} for 'res').
+            item_res = item.get('res', {})
+            final_htmls.append(item_res.get('html', ''))
+            final_types.append(item.get('type', ''))
+            final_bboxes.append(item.get('bbox', ''))
     return {
+        'htmls': final_htmls,
         'hash': file_hash,
+        'bboxes': final_bboxes,
+        'types': final_types,
     }
+# Serve the "output" directory as static files under the /output path.
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
 # Start a uvicorn server when this module is executed as a script.
 if __name__ == '__main__':
+    uvicorn.run(app=app)