document-extraction

Sleeping

vkumartr committed on Feb 5, 2025

Commit

010967e

verified ·

1 Parent(s): b15de05

Changes updated

Files changed (1) hide show

app.py CHANGED Viewed

@@ -238,11 +238,7 @@ def ocr_from_s3(
     document_type: str = Query(..., description="Type of document"),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
-    """
-    Extract text from a PDF or Image stored in S3 and process it based on document size.
-    If more than 2 pages, skip Base64 conversion and summarization.
-    Store extracted data in MongoDB.
-    """
     try:
         # Fetch file from S3
         file_data, content_type = fetch_file_from_s3_file(file_key)
@@ -278,7 +274,6 @@ def ocr_from_s3(
             if num_pages <= 2:
                 full_text = " ".join(extracted_text)
                 summary = summarize_text(full_text)
         else:
             return {"error": f"Unsupported file type: {content_type}"}

     document_type: str = Query(..., description="Type of document"),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
+    """Extract text from a PDF or Image stored in S3 and process it based on document size."""
     try:
         # Fetch file from S3
         file_data, content_type = fetch_file_from_s3_file(file_key)
             if num_pages <= 2:
                 full_text = " ".join(extracted_text)
                 summary = summarize_text(full_text)
         else:
             return {"error": f"Unsupported file type: {content_type}"}