Spaces:

bluewhale2025
/

parseai-document-processor

Build error

App Files Files Community

bluewhale2025 committed on May 23, 2025

Commit

9d14f12

1 Parent(s): 1601965

Fix imports and improve error handling in app.py

Browse files

Files changed (1) hide show

app.py +28 -51

app.py CHANGED Viewed

@@ -100,84 +100,61 @@ async def health_check():
     }
 async def process_document(file_path: str):
-    """Process a document as a background task.
     Args:
-        file_path: Path to the uploaded file
     Returns:
-        dict: Processing results including status and metadata
     """
-    logger.info(f"Starting document processing: {file_path}")
     try:
-        # Verify file exists
-        if not os.path.exists(file_path):
-            error_msg = f"File not found: {file_path}"
-            logger.error(error_msg)
-            raise FileNotFoundError(error_msg)
-        # Extract text from PDF
-        logger.info(f"Extracting text from: {file_path}")
-        extracted_data = pdf_extractor.extract_text(file_path)
-        if not extracted_data or "text_by_page" not in extracted_data:
-            error_msg = f"Failed to extract text from: {file_path}"
-            logger.error(error_msg)
-            raise ValueError(error_msg)
-        # Combine text from all pages
         full_text = " ".join([page["text"] for page in extracted_data["text_by_page"]])
-        if not full_text.strip():
-            error_msg = f"No text content found in: {file_path}"
-            logger.error(error_msg)
-            raise ValueError(error_msg)
-        # Generate summary
-        logger.info(f"Generating summary for: {file_path}")
-        try:
-            summary_result = document_summarizer.summarize_text(full_text)
-        except Exception as e:
-            logger.error(f"Error during summarization: {str(e)}")
-            summary_result = {"full_summary": "Summary generation failed", "key_points": []}
-        # Add to vector store
-        logger.info(f"Adding document to vector store: {file_path}")
         metadata = {
-            "filename": os.path.basename(file_path),
-            "total_pages": extracted_data.get("total_pages", 0),
             "summary": summary_result.get("full_summary", ""),
-            "timestamp": extracted_data.get("timestamp", ""),
-            "source": "upload"
         }
-        try:
-            vector_store.add_document(full_text, metadata)
-        except Exception as e:
-            logger.error(f"Failed to add document to vector store: {str(e)}")
-            raise
-        # Save processed data
-        processed_path = None
-        try:
-            processed_path = pdf_extractor.save_extracted_text(
-                {
                 **extracted_data,
-                "summary": summary_result["full_summary"],
-                "chunk_summaries": summary_result["chunk_summaries"]
             },
             str(PROCESSED_DIR)
         )
         return {
             "status": "success",
             "processed_file": processed_path,
-            "summary": summary_result["full_summary"]
         }
     except Exception as e:
-        raise Exception(f"문서 처리 중 오류 발생: {str(e)}")
 @app.post("/upload/pdf")
 async def upload_pdf(

     }
 async def process_document(file_path: str):
+    """
+    Process a document by extracting text, summarizing it, and adding to the vector store.
     Args:
+        file_path (str): Path to the file to process
     Returns:
+        dict: Processing results including status, processed file path, and summary
     """
     try:
+        logger.info(f"Processing document: {file_path}")
+        # PDF 텍스트 추출
+        extracted_data = pdf_extractor.extract_text(file_path)
+        logger.info(f"Extracted text from {len(extracted_data['text_by_page'])} pages")
+        # 전체 텍스트 추출
         full_text = " ".join([page["text"] for page in extracted_data["text_by_page"]])
+        # 텍스트 요약
+        summary_result = document_summarizer.summarize_text(full_text)
+        logger.info("Document summarization completed")
+        # 벡터 저장소에 추가
         metadata = {
+            "filename": extracted_data["filename"],
+            "total_pages": extracted_data["total_pages"],
             "summary": summary_result.get("full_summary", ""),
+            "timestamp": extracted_data.get("timestamp", "")
         }
+        vector_store.add_document(full_text, metadata)
+        logger.info("Document added to vector store")
+        # 처리된 데이터 저장
+        processed_path = pdf_extractor.save_extracted_text(
+            {
                 **extracted_data,
+                "summary": summary_result.get("full_summary", ""),
+                "chunk_summaries": summary_result.get("chunk_summaries", [])
             },
             str(PROCESSED_DIR)
         )
+        logger.info(f"Processed data saved to {processed_path}")
         return {
             "status": "success",
             "processed_file": processed_path,
+            "summary": summary_result.get("full_summary", "")
         }
     except Exception as e:
+        error_msg = f"Error processing document: {str(e)}"
+        logger.error(error_msg, exc_info=True)
+        raise Exception(error_msg)
 @app.post("/upload/pdf")
 async def upload_pdf(