bluewhale2025 committed on
Commit
9d14f12
·
1 Parent(s): 1601965

Fix imports and improve error handling in app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -51
app.py CHANGED
@@ -100,84 +100,61 @@ async def health_check():
100
  }
101
 
102
  async def process_document(file_path: str):
103
- """Process a document as a background task.
 
104
 
105
  Args:
106
- file_path: Path to the uploaded file
107
 
108
  Returns:
109
- dict: Processing results including status and metadata
110
  """
111
- logger.info(f"Starting document processing: {file_path}")
112
-
113
  try:
114
- # Verify file exists
115
- if not os.path.exists(file_path):
116
- error_msg = f"File not found: {file_path}"
117
- logger.error(error_msg)
118
- raise FileNotFoundError(error_msg)
119
-
120
- # Extract text from PDF
121
- logger.info(f"Extracting text from: {file_path}")
122
- extracted_data = pdf_extractor.extract_text(file_path)
123
 
124
- if not extracted_data or "text_by_page" not in extracted_data:
125
- error_msg = f"Failed to extract text from: {file_path}"
126
- logger.error(error_msg)
127
- raise ValueError(error_msg)
128
 
129
- # Combine text from all pages
130
  full_text = " ".join([page["text"] for page in extracted_data["text_by_page"]])
131
 
132
- if not full_text.strip():
133
- error_msg = f"No text content found in: {file_path}"
134
- logger.error(error_msg)
135
- raise ValueError(error_msg)
136
 
137
- # Generate summary
138
- logger.info(f"Generating summary for: {file_path}")
139
- try:
140
- summary_result = document_summarizer.summarize_text(full_text)
141
- except Exception as e:
142
- logger.error(f"Error during summarization: {str(e)}")
143
- summary_result = {"full_summary": "Summary generation failed", "key_points": []}
144
-
145
- # Add to vector store
146
- logger.info(f"Adding document to vector store: {file_path}")
147
  metadata = {
148
- "filename": os.path.basename(file_path),
149
- "total_pages": extracted_data.get("total_pages", 0),
150
  "summary": summary_result.get("full_summary", ""),
151
- "timestamp": extracted_data.get("timestamp", ""),
152
- "source": "upload"
153
  }
154
 
155
- try:
156
- vector_store.add_document(full_text, metadata)
157
- except Exception as e:
158
- logger.error(f"Failed to add document to vector store: {str(e)}")
159
- raise
160
 
161
- # Save processed data
162
- processed_path = None
163
- try:
164
- processed_path = pdf_extractor.save_extracted_text(
165
- {
166
  **extracted_data,
167
- "summary": summary_result["full_summary"],
168
- "chunk_summaries": summary_result["chunk_summaries"]
169
  },
170
  str(PROCESSED_DIR)
171
  )
 
172
 
173
  return {
174
  "status": "success",
175
  "processed_file": processed_path,
176
- "summary": summary_result["full_summary"]
177
  }
178
 
179
  except Exception as e:
180
- raise Exception(f"문서 처리 중 오류 발생: {str(e)}")
 
 
181
 
182
  @app.post("/upload/pdf")
183
  async def upload_pdf(
 
100
  }
101
 
102
  async def process_document(file_path: str):
103
+ """
104
+ Process a document by extracting text, summarizing it, and adding to the vector store.
105
 
106
  Args:
107
+ file_path (str): Path to the file to process
108
 
109
  Returns:
110
+ dict: Processing results including status, processed file path, and summary
111
  """
 
 
112
  try:
113
+ logger.info(f"Processing document: {file_path}")
 
 
 
 
 
 
 
 
114
 
115
+ # PDF 텍스트 추출
116
+ extracted_data = pdf_extractor.extract_text(file_path)
117
+ logger.info(f"Extracted text from {len(extracted_data['text_by_page'])} pages")
 
118
 
119
+ # 전체 텍스트 추출
120
  full_text = " ".join([page["text"] for page in extracted_data["text_by_page"]])
121
 
122
+ # 텍스트 요약
123
+ summary_result = document_summarizer.summarize_text(full_text)
124
+ logger.info("Document summarization completed")
 
125
 
126
+ # 벡터 저장소에 추가
 
 
 
 
 
 
 
 
 
127
  metadata = {
128
+ "filename": extracted_data["filename"],
129
+ "total_pages": extracted_data["total_pages"],
130
  "summary": summary_result.get("full_summary", ""),
131
+ "timestamp": extracted_data.get("timestamp", "")
 
132
  }
133
 
134
+ vector_store.add_document(full_text, metadata)
135
+ logger.info("Document added to vector store")
 
 
 
136
 
137
+ # 처리된 데이터 저장
138
+ processed_path = pdf_extractor.save_extracted_text(
139
+ {
 
 
140
  **extracted_data,
141
+ "summary": summary_result.get("full_summary", ""),
142
+ "chunk_summaries": summary_result.get("chunk_summaries", [])
143
  },
144
  str(PROCESSED_DIR)
145
  )
146
+ logger.info(f"Processed data saved to {processed_path}")
147
 
148
  return {
149
  "status": "success",
150
  "processed_file": processed_path,
151
+ "summary": summary_result.get("full_summary", "")
152
  }
153
 
154
  except Exception as e:
155
+ error_msg = f"Error processing document: {str(e)}"
156
+ logger.error(error_msg, exc_info=True)
157
+ raise Exception(error_msg)
158
 
159
  @app.post("/upload/pdf")
160
  async def upload_pdf(