"""
Direct GDC PDF OCR without saving PDFs to disk - streams each PDF from the GDC API into memory.
"""

import io
import time
from pathlib import Path

import fitz
import numpy as np
import requests
from paddleocr import PaddleOCR
from PIL import Image
from tqdm import tqdm


# Directory that receives one "{file_id}.txt" per successfully OCR'd PDF.
# It is created at import time (parents included) if it does not exist yet.
OUTPUT_DIR = Path("paddleocr_results/text")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


# Single PaddleOCR instance shared by every page and file processed in a run.
# NOTE(review): these keyword names look like the PaddleOCR 3.x spellings;
# confirm against the installed paddleocr version.
ocr = PaddleOCR(
    lang="en",
    use_textline_orientation=True,
    device="gpu",
)


def download_pdf_to_memory(file_id):
    """Download one file from the GDC data endpoint into an in-memory buffer.

    Args:
        file_id: GDC file identifier; it is appended to the
            ``https://api.gdc.cancer.gov/data/`` endpoint URL.

    Returns:
        An ``io.BytesIO`` holding the complete response body, rewound to
        offset 0 so it can be read from the start.

    Raises:
        RuntimeError: If sending the request, the HTTP status check, or
            reading the body fails. The underlying exception is chained as
            ``__cause__``.
    """
    url = f"https://api.gdc.cancer.gov/data/{file_id}"

    try:
        # With stream=True the connection stays open until the body is fully
        # consumed or the response is closed; the context manager guarantees
        # the close even when the transfer fails part-way.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()

            pdf_bytes = io.BytesIO()
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    pdf_bytes.write(chunk)
    except Exception as e:
        # Deliberately broad: callers get a single failure type for any
        # download problem, with the real cause preserved via chaining.
        raise RuntimeError(f"Failed to download: {e}") from e

    pdf_bytes.seek(0)
    return pdf_bytes


def pdf_to_images(pdf_bytes):
    """Render every page of an in-memory PDF to a PIL image.

    Args:
        pdf_bytes: PDF content handed to ``fitz.open`` as ``stream``
            (``download_pdf_to_memory`` supplies an ``io.BytesIO``).

    Returns:
        A list with one ``PIL.Image.Image`` in ``"RGB"`` mode per page, in
        page order, rendered at 200 DPI.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        images = []
        for page in doc:
            pix = page.get_pixmap(dpi=200)
            # The pixmap's raw samples are interpreted as packed RGB, which
            # relies on get_pixmap's default colourspace/alpha settings.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
        return images
    finally:
        # Close the document even if rendering a page raised.
        doc.close()


def _page_lines(page_result):
    """Return the recognised strings of one page, or [] if nothing was found.

    Accepts both result layouts: the dict-like per-image result of
    PaddleOCR 3.x (recognised strings under ``"rec_texts"``) and the 2.x
    list of ``[box, (text, score)]`` entries.
    """
    if not page_result:
        return []
    if isinstance(page_result, dict):
        return list(page_result.get("rec_texts") or [])
    return [line[1][0] for line in page_result]


def ocr_pdf_from_gdc(file_id):
    """Download a PDF from GDC, OCR every page, and return the combined text.

    Args:
        file_id: GDC file identifier, passed to ``download_pdf_to_memory``.

    Returns:
        The text of all pages on which OCR found something. Each such page
        contributes a ``--- Page N ---`` header (``N`` is 1-based) followed
        by its recognised strings joined with single spaces. An empty string
        is returned when no page yielded text.
    """
    pdf_bytes = download_pdf_to_memory(file_id)
    images = pdf_to_images(pdf_bytes)

    # The module-level PaddleOCR(...) call uses the 3.x keyword names. In 3.x
    # inference goes through ``predict`` and the legacy ``cls`` keyword is
    # rejected; older releases only provide ``ocr(..., cls=True)``. Prefer
    # ``predict`` when the engine has it and fall back to the legacy call.
    predict = getattr(ocr, "predict", None)

    all_text = []
    for page_idx, img in enumerate(images, 1):
        img_np = np.array(img)

        if callable(predict):
            result = predict(img_np)
        else:
            result = ocr.ocr(img_np, cls=True)

        # One image in, so the first entry holds this page's result.
        lines = _page_lines(result[0]) if result else []
        if lines:
            page_text = " ".join(lines)
            all_text.append(f"\n--- Page {page_idx} ---\n{page_text}")

    return "\n".join(all_text)


def main(
    manifest_file="/usr/users/3d_dimension_est/selva_sur/RAG/data/file_ids.txt",
):
    """OCR every PDF listed in a manifest and write one text file per PDF.

    The manifest is read as tab-separated text: the first line is skipped as
    a header and the first column of every remaining non-blank line is used
    as a GDC file ID. For each ID the OCR text is written to
    ``OUTPUT_DIR / "{file_id}.txt"``. IDs that raise, or that yield no text,
    are counted as failures and listed in ``OUTPUT_DIR / "failed_ocr_ids.txt"``.
    A summary is printed at the end.

    Args:
        manifest_file: Path to the manifest. Defaults to the location that
            was previously hard-coded, so calling ``main()`` is unchanged.

    Returns:
        None. If the manifest does not exist, an error is printed and the
        function returns without processing anything.
    """
    manifest_path = Path(manifest_file)
    if not manifest_path.exists():
        print(f" ERROR: Manifest file not found: {manifest_file}")
        return

    with manifest_path.open("r", encoding="utf-8") as f:
        next(f, None)  # skip the header line
        first_columns = (line.split("\t")[0].strip() for line in f)
        # Blank lines and rows with an empty first column carry no ID.
        file_ids = [fid for fid in first_columns if fid]

    print("\n" + "=" * 70)
    print("PADDLEOCR – DIRECT GDC PDF OCR (NO DOWNLOAD)")
    print("=" * 70)
    print(f"Manifest file : {manifest_file}")
    print(f"Output directory : {OUTPUT_DIR}")
    print(f"Files to process : {len(file_ids):,}")
    print("Device : GPU")
    print("=" * 70 + "\n")

    success, failed = 0, 0
    total_chars = 0
    failed_ids = []

    for file_id in tqdm(file_ids, desc="Processing PDFs", unit="files"):
        try:
            text = ocr_pdf_from_gdc(file_id)

            if not text.strip():
                raise RuntimeError("No text extracted")

            out_file = OUTPUT_DIR / f"{file_id}.txt"
            out_file.write_text(text, encoding="utf-8")

            total_chars += len(text)
            success += 1

        except Exception as e:
            # Per-file boundary: record the failure and keep going so one bad
            # PDF does not abort the whole batch.
            failed += 1
            failed_ids.append(file_id)
            tqdm.write(f" {file_id}: {e}")

        # Brief pause between files (presumably to go easy on the GDC API).
        time.sleep(0.1)

    print("\n" + "=" * 70)
    print("OCR SUMMARY")
    print("=" * 70)
    print(f"Total attempted : {len(file_ids):,}")
    print(f"Successful : {success:,}")
    print(f"Failed : {failed:,}")
    # max(..., 1) avoids dividing by zero when the manifest lists no IDs.
    print(f"Success rate : {100 * success / max(len(file_ids), 1):.1f}%")
    print(f"Total text : {total_chars:,} chars")
    print("=" * 70)

    if failed_ids:
        failed_file = OUTPUT_DIR / "failed_ocr_ids.txt"
        with failed_file.open("w", encoding="utf-8") as f:
            f.write("file_id\n")
            f.writelines(f"{fid}\n" for fid in failed_ids)
        print(f"\n Failed file IDs saved to: {failed_file}")


# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()