Update app.py
app.py CHANGED
@@ -61,6 +61,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         with pdfplumber.open(file_path) as pdf:
             total_pages = len(pdf.pages)
             if total_pages == 0:
+                logger.error("No pages found in PDF")
                 return ""
 
         batch_size = 10
@@ -71,22 +72,28 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         def extract_batch(start: int, end: int) -> List[tuple]:
             results = []
             with pdfplumber.open(file_path) as pdf:
-                for page in pdf.pages[start:end]:
-                    page_num = start + pdf.pages.index(page)
+                for idx, page in enumerate(pdf.pages[start:end], start=start):
                     page_text = page.extract_text() or ""
-                    results.append((
+                    results.append((idx, f"=== Page {idx + 1} ===\n{page_text.strip()}"))
+                    logger.debug("Extracted page %d, text length: %d chars", idx + 1, len(page_text))
             return results
 
         with ThreadPoolExecutor(max_workers=6) as executor:
             futures = [executor.submit(extract_batch, start, end) for start, end in batches]
             for future in as_completed(futures):
                 for page_num, text in future.result():
-                    text_chunks[page_num] = text
+                    if page_num < len(text_chunks):
+                        text_chunks[page_num] = text
+                    else:
+                        logger.warning("Page number %d out of range for text_chunks (size %d)", page_num, len(text_chunks))
                 processed_pages += batch_size
                 if progress_callback:
                     progress_callback(min(processed_pages, total_pages), total_pages)
+                    logger.info("Processed %d/%d pages", min(processed_pages, total_pages), total_pages)
 
-        return "\n\n".join(filter(None, text_chunks))
+        extracted_text = "\n\n".join(filter(None, text_chunks))
+        logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
+        return extracted_text
     except Exception as e:
         logger.error("PDF processing error: %s", e)
         return f"PDF processing error: {str(e)}"
@@ -96,6 +103,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
         file_h = file_hash(file_path)
         cache_key = f"{file_h}_{file_type}"
         if cache_key in cache:
+            logger.info("Using cached extraction for %s", file_path)
             return cache[cache_key]
 
         if file_type == "pdf":
@@ -117,6 +125,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
             result = json.dumps({"error": f"Unsupported file type: {file_type}"})
 
         cache[cache_key] = result
+        logger.info("Cached extraction for %s, size: %d bytes", file_path, len(result))
         return result
     except Exception as e:
         logger.error("Error processing %s: %s", os.path.basename(file_path), e)
@@ -259,9 +268,11 @@ Patient Record Excerpt (Chunk {0} of {1}):
 
         history.append({"role": "assistant", "content": "✅ Text extraction complete."})
         yield history, None, ""
+        logger.info("Extracted text length: %d chars", len(extracted))
 
         chunk_size = 6000
         chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
+        logger.info("Created %d chunks", len(chunks))
         combined_response = ""
         batch_size = 2
 
@@ -287,7 +298,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
                     if cleaned and re.search(r"###\s*\w+", cleaned):
                         chunk_response += cleaned + "\n\n"
                 elif isinstance(chunk_output, str) and chunk_output.strip():
-                    cleaned = clean_response(
+                    cleaned = clean_response(chunk_output)
                     if cleaned and re.search(r"###\s*\w+", cleaned):
                         chunk_response += cleaned + "\n\n"
             batch_responses.append(chunk_response)
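The main behavioral fix is in `extract_batch`: the old loop recovered each page's absolute number with `pdf.pages.index(page)`, an O(n) scan on every iteration, while the new loop gets the index directly from `enumerate(pdf.pages[start:end], start=start)`. Below is a minimal, self-contained sketch of the overall pattern this commit settles on. The preallocation of `text_chunks` and the construction of `batches` are outside the changed lines, so their shape here is an assumption about the surrounding code.

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple

import pdfplumber

def extract_pages_parallel(file_path: str, batch_size: int = 10) -> str:
    """Sketch of the commit's pattern: extract page batches across threads,
    then reassemble in page order. text_chunks/batches are assumed shapes."""
    with pdfplumber.open(file_path) as pdf:
        total_pages = len(pdf.pages)
    if total_pages == 0:
        return ""

    # One slot per page: futures complete in any order, but writing by
    # absolute index keeps the final text in page order.
    text_chunks: List[str] = [""] * total_pages
    batches = [(i, min(i + batch_size, total_pages))
               for i in range(0, total_pages, batch_size)]

    def extract_batch(start: int, end: int) -> List[Tuple[int, str]]:
        results = []
        # Each worker opens its own handle, mirroring the diff, so no
        # single pdfplumber object is shared across threads.
        with pdfplumber.open(file_path) as pdf:
            # enumerate(..., start=start) yields the absolute page index,
            # replacing the old O(n) pdf.pages.index(page) lookup.
            for idx, page in enumerate(pdf.pages[start:end], start=start):
                page_text = page.extract_text() or ""
                results.append((idx, f"=== Page {idx + 1} ===\n{page_text.strip()}"))
        return results

    with ThreadPoolExecutor(max_workers=6) as executor:
        futures = [executor.submit(extract_batch, s, e) for s, e in batches]
        for future in as_completed(futures):
            for page_num, text in future.result():
                if page_num < len(text_chunks):  # bounds check added by the commit
                    text_chunks[page_num] = text

    return "\n\n".join(filter(None, text_chunks))
```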
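The logging added in `convert_file_to_json` sits around a content-hash cache: results are keyed on `file_hash(file_path)` plus the file type, so re-uploading the same document skips extraction entirely. A minimal sketch of that keying follows; `file_hash`'s definition is outside this diff, so the digest-over-bytes implementation here is an assumption, as is the plain-dict cache.

```python
import hashlib

def file_hash(file_path: str) -> str:
    # Assumed implementation: digest of the file bytes, so an identical
    # file re-uploaded under a new name still hits the cache.
    h = hashlib.sha256()
    with open(file_path, "rb") as f:
        for block in iter(lambda: f.read(1 << 16), b""):
            h.update(block)
    return h.hexdigest()

cache: dict = {}  # the app's real cache backend is not shown in this diff

def get_or_extract(file_path: str, file_type: str, extract) -> str:
    cache_key = f"{file_hash(file_path)}_{file_type}"
    if cache_key in cache:
        return cache[cache_key]   # cached extraction, as in the diff
    result = extract(file_path)   # e.g. the parallel PDF extraction above
    cache[cache_key] = result
    return result
```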