Spaces:
Build error
Build error
Commit
·
9d14f12
1
Parent(s):
1601965
Fix imports and improve error handling in app.py
Browse files
app.py
CHANGED
|
@@ -100,84 +100,61 @@ async def health_check():
|
|
| 100 |
}
|
| 101 |
|
| 102 |
async def process_document(file_path: str):
|
| 103 |
-
"""
|
|
|
|
| 104 |
|
| 105 |
Args:
|
| 106 |
-
file_path: Path to the
|
| 107 |
|
| 108 |
Returns:
|
| 109 |
-
dict: Processing results including status and
|
| 110 |
"""
|
| 111 |
-
logger.info(f"Starting document processing: {file_path}")
|
| 112 |
-
|
| 113 |
try:
|
| 114 |
-
|
| 115 |
-
if not os.path.exists(file_path):
|
| 116 |
-
error_msg = f"File not found: {file_path}"
|
| 117 |
-
logger.error(error_msg)
|
| 118 |
-
raise FileNotFoundError(error_msg)
|
| 119 |
-
|
| 120 |
-
# Extract text from PDF
|
| 121 |
-
logger.info(f"Extracting text from: {file_path}")
|
| 122 |
-
extracted_data = pdf_extractor.extract_text(file_path)
|
| 123 |
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
raise ValueError(error_msg)
|
| 128 |
|
| 129 |
-
#
|
| 130 |
full_text = " ".join([page["text"] for page in extracted_data["text_by_page"]])
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
raise ValueError(error_msg)
|
| 136 |
|
| 137 |
-
#
|
| 138 |
-
logger.info(f"Generating summary for: {file_path}")
|
| 139 |
-
try:
|
| 140 |
-
summary_result = document_summarizer.summarize_text(full_text)
|
| 141 |
-
except Exception as e:
|
| 142 |
-
logger.error(f"Error during summarization: {str(e)}")
|
| 143 |
-
summary_result = {"full_summary": "Summary generation failed", "key_points": []}
|
| 144 |
-
|
| 145 |
-
# Add to vector store
|
| 146 |
-
logger.info(f"Adding document to vector store: {file_path}")
|
| 147 |
metadata = {
|
| 148 |
-
"filename":
|
| 149 |
-
"total_pages": extracted_data
|
| 150 |
"summary": summary_result.get("full_summary", ""),
|
| 151 |
-
"timestamp": extracted_data.get("timestamp", "")
|
| 152 |
-
"source": "upload"
|
| 153 |
}
|
| 154 |
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
except Exception as e:
|
| 158 |
-
logger.error(f"Failed to add document to vector store: {str(e)}")
|
| 159 |
-
raise
|
| 160 |
|
| 161 |
-
#
|
| 162 |
-
processed_path =
|
| 163 |
-
|
| 164 |
-
processed_path = pdf_extractor.save_extracted_text(
|
| 165 |
-
{
|
| 166 |
**extracted_data,
|
| 167 |
-
"summary": summary_result
|
| 168 |
-
"chunk_summaries": summary_result
|
| 169 |
},
|
| 170 |
str(PROCESSED_DIR)
|
| 171 |
)
|
|
|
|
| 172 |
|
| 173 |
return {
|
| 174 |
"status": "success",
|
| 175 |
"processed_file": processed_path,
|
| 176 |
-
"summary": summary_result
|
| 177 |
}
|
| 178 |
|
| 179 |
except Exception as e:
|
| 180 |
-
|
|
|
|
|
|
|
| 181 |
|
| 182 |
@app.post("/upload/pdf")
|
| 183 |
async def upload_pdf(
|
|
|
|
| 100 |
}
|
| 101 |
|
| 102 |
async def process_document(file_path: str):
|
| 103 |
+
"""
|
| 104 |
+
Process a document by extracting text, summarizing it, and adding to the vector store.
|
| 105 |
|
| 106 |
Args:
|
| 107 |
+
file_path (str): Path to the file to process
|
| 108 |
|
| 109 |
Returns:
|
| 110 |
+
dict: Processing results including status, processed file path, and summary
|
| 111 |
"""
|
|
|
|
|
|
|
| 112 |
try:
|
| 113 |
+
logger.info(f"Processing document: {file_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
+
# PDF 텍스트 추출
|
| 116 |
+
extracted_data = pdf_extractor.extract_text(file_path)
|
| 117 |
+
logger.info(f"Extracted text from {len(extracted_data['text_by_page'])} pages")
|
|
|
|
| 118 |
|
| 119 |
+
# 전체 텍스트 추출
|
| 120 |
full_text = " ".join([page["text"] for page in extracted_data["text_by_page"]])
|
| 121 |
|
| 122 |
+
# 텍스트 요약
|
| 123 |
+
summary_result = document_summarizer.summarize_text(full_text)
|
| 124 |
+
logger.info("Document summarization completed")
|
|
|
|
| 125 |
|
| 126 |
+
# 벡터 저장소에 추가
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
metadata = {
|
| 128 |
+
"filename": extracted_data["filename"],
|
| 129 |
+
"total_pages": extracted_data["total_pages"],
|
| 130 |
"summary": summary_result.get("full_summary", ""),
|
| 131 |
+
"timestamp": extracted_data.get("timestamp", "")
|
|
|
|
| 132 |
}
|
| 133 |
|
| 134 |
+
vector_store.add_document(full_text, metadata)
|
| 135 |
+
logger.info("Document added to vector store")
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
+
# 처리된 데이터 저장
|
| 138 |
+
processed_path = pdf_extractor.save_extracted_text(
|
| 139 |
+
{
|
|
|
|
|
|
|
| 140 |
**extracted_data,
|
| 141 |
+
"summary": summary_result.get("full_summary", ""),
|
| 142 |
+
"chunk_summaries": summary_result.get("chunk_summaries", [])
|
| 143 |
},
|
| 144 |
str(PROCESSED_DIR)
|
| 145 |
)
|
| 146 |
+
logger.info(f"Processed data saved to {processed_path}")
|
| 147 |
|
| 148 |
return {
|
| 149 |
"status": "success",
|
| 150 |
"processed_file": processed_path,
|
| 151 |
+
"summary": summary_result.get("full_summary", "")
|
| 152 |
}
|
| 153 |
|
| 154 |
except Exception as e:
|
| 155 |
+
error_msg = f"Error processing document: {str(e)}"
|
| 156 |
+
logger.error(error_msg, exc_info=True)
|
| 157 |
+
raise Exception(error_msg)
|
| 158 |
|
| 159 |
@app.post("/upload/pdf")
|
| 160 |
async def upload_pdf(
|