Update app.py
app.py CHANGED
@@ -61,6 +61,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         with pdfplumber.open(file_path) as pdf:
             total_pages = len(pdf.pages)
             if total_pages == 0:
+                logger.error("No pages found in PDF")
                 return ""
 
         batch_size = 10
@@ -71,22 +72,28 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         def extract_batch(start: int, end: int) -> List[tuple]:
             results = []
             with pdfplumber.open(file_path) as pdf:
-                for page in pdf.pages[start:end]:
-                    page_num = start + pdf.pages.index(page)
+                for idx, page in enumerate(pdf.pages[start:end], start=start):
                     page_text = page.extract_text() or ""
-                    results.append((
+                    results.append((idx, f"=== Page {idx + 1} ===\n{page_text.strip()}"))
+                    logger.debug("Extracted page %d, text length: %d chars", idx + 1, len(page_text))
             return results
 
         with ThreadPoolExecutor(max_workers=6) as executor:
             futures = [executor.submit(extract_batch, start, end) for start, end in batches]
             for future in as_completed(futures):
                 for page_num, text in future.result():
-                    text_chunks[page_num] = text
+                    if page_num < len(text_chunks):
+                        text_chunks[page_num] = text
+                    else:
+                        logger.warning("Page number %d out of range for text_chunks (size %d)", page_num, len(text_chunks))
                 processed_pages += batch_size
                 if progress_callback:
                     progress_callback(min(processed_pages, total_pages), total_pages)
+                    logger.info("Processed %d/%d pages", min(processed_pages, total_pages), total_pages)
 
-        return "\n\n".join(filter(None, text_chunks))
+        extracted_text = "\n\n".join(filter(None, text_chunks))
+        logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
+        return extracted_text
     except Exception as e:
         logger.error("PDF processing error: %s", e)
         return f"PDF processing error: {str(e)}"
@@ -96,6 +103,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
         file_h = file_hash(file_path)
         cache_key = f"{file_h}_{file_type}"
         if cache_key in cache:
+            logger.info("Using cached extraction for %s", file_path)
             return cache[cache_key]
 
         if file_type == "pdf":
@@ -117,6 +125,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
             result = json.dumps({"error": f"Unsupported file type: {file_type}"})
 
         cache[cache_key] = result
+        logger.info("Cached extraction for %s, size: %d bytes", file_path, len(result))
         return result
     except Exception as e:
         logger.error("Error processing %s: %s", os.path.basename(file_path), e)
@@ -259,9 +268,11 @@ Patient Record Excerpt (Chunk {0} of {1}):
 
         history.append({"role": "assistant", "content": "✅ Text extraction complete."})
         yield history, None, ""
+        logger.info("Extracted text length: %d chars", len(extracted))
 
         chunk_size = 6000
         chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
+        logger.info("Created %d chunks", len(chunks))
         combined_response = ""
         batch_size = 2
 
@@ -287,7 +298,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
                     if cleaned and re.search(r"###\s*\w+", cleaned):
                         chunk_response += cleaned + "\n\n"
                 elif isinstance(chunk_output, str) and chunk_output.strip():
-                    cleaned = clean_response(
+                    cleaned = clean_response(chunk_output)
                     if cleaned and re.search(r"###\s*\w+", cleaned):
                         chunk_response += cleaned + "\n\n"
             batch_responses.append(chunk_response)
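The main behavioral fix is in `extract_batch`: the old loop recovered each page's absolute number with `pdf.pages.index(page)`, an O(n) scan on every iteration, while the new loop gets the index directly from `enumerate(pdf.pages[start:end], start=start)`. Below is a minimal, self-contained sketch of the overall pattern this commit settles on. The preallocation of `text_chunks` and the construction of `batches` are outside the changed lines, so their shape here is an assumption about the surrounding code.

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple

import pdfplumber

def extract_pages_parallel(file_path: str, batch_size: int = 10) -> str:
    """Sketch of the commit's pattern: extract page batches across threads,
    then reassemble in page order. text_chunks/batches are assumed shapes."""
    with pdfplumber.open(file_path) as pdf:
        total_pages = len(pdf.pages)
    if total_pages == 0:
        return ""

    # One slot per page: futures complete in any order, but writing by
    # absolute index keeps the final text in page order.
    text_chunks: List[str] = [""] * total_pages
    batches = [(i, min(i + batch_size, total_pages))
               for i in range(0, total_pages, batch_size)]

    def extract_batch(start: int, end: int) -> List[Tuple[int, str]]:
        results = []
        # Each worker opens its own handle, mirroring the diff, so no
        # single pdfplumber object is shared across threads.
        with pdfplumber.open(file_path) as pdf:
            # enumerate(..., start=start) yields the absolute page index,
            # replacing the old O(n) pdf.pages.index(page) lookup.
            for idx, page in enumerate(pdf.pages[start:end], start=start):
                page_text = page.extract_text() or ""
                results.append((idx, f"=== Page {idx + 1} ===\n{page_text.strip()}"))
        return results

    with ThreadPoolExecutor(max_workers=6) as executor:
        futures = [executor.submit(extract_batch, s, e) for s, e in batches]
        for future in as_completed(futures):
            for page_num, text in future.result():
                if page_num < len(text_chunks):  # bounds check added by the commit
                    text_chunks[page_num] = text

    return "\n\n".join(filter(None, text_chunks))
```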
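The logging added in `convert_file_to_json` sits around a content-hash cache: results are keyed on `file_hash(file_path)` plus the file type, so re-uploading the same document skips extraction entirely. A minimal sketch of that keying follows; `file_hash`'s definition is outside this diff, so the digest-over-bytes implementation here is an assumption, as is the plain-dict cache.

```python
import hashlib

def file_hash(file_path: str) -> str:
    # Assumed implementation: digest of the file bytes, so an identical
    # file re-uploaded under a new name still hits the cache.
    h = hashlib.sha256()
    with open(file_path, "rb") as f:
        for block in iter(lambda: f.read(1 << 16), b""):
            h.update(block)
    return h.hexdigest()

cache: dict = {}  # the app's real cache backend is not shown in this diff

def get_or_extract(file_path: str, file_type: str, extract) -> str:
    cache_key = f"{file_hash(file_path)}_{file_type}"
    if cache_key in cache:
        return cache[cache_key]   # cached extraction, as in the diff
    result = extract(file_path)   # e.g. the parallel PDF extraction above
    cache[cache_key] = result
    return result
```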