#!/usr/bin/env python3
"""
Direct GDC PDF OCR without downloading - streams PDFs from GDC API
"""

from pathlib import Path
from tqdm import tqdm
import fitz  # PyMuPDF
from paddleocr import PaddleOCR
from PIL import Image
import numpy as np
import requests
import io
import time

# Paths
OUTPUT_DIR = Path("paddleocr_results/text")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Initialize OCR once
ocr = PaddleOCR(
    lang="en",
    use_textline_orientation=True,
    device="gpu"
)


def download_pdf_to_memory(file_id):
    """Download a PDF from the GDC API into an in-memory buffer.

    Args:
        file_id: GDC file identifier appended to the ``/data/`` endpoint URL.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 holding the response body.

    Raises:
        RuntimeError: If the request or the streamed read fails for any
            reason; the original exception is chained as ``__cause__``.
    """
    url = f"https://api.gdc.cancer.gov/data/{file_id}"
    pdf_bytes = io.BytesIO()
    try:
        # With stream=True the connection is only returned to the pool once
        # the response is closed, so use it as a context manager.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    pdf_bytes.write(chunk)
    except Exception as e:
        raise RuntimeError(f"Failed to download: {e}") from e
    pdf_bytes.seek(0)
    return pdf_bytes


def pdf_to_images(pdf_bytes):
    """Render every page of an in-memory PDF to a PIL image at 200 dpi.

    Args:
        pdf_bytes: PDF content passed to ``fitz.open`` as ``stream``
            (the caller in this file passes an ``io.BytesIO``).

    Returns:
        A list of RGB ``PIL.Image.Image`` objects, one per page, in order.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    images = []
    try:
        for page in doc:
            pix = page.get_pixmap(dpi=200)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)
    finally:
        # Close the document even when rendering a page raises.
        doc.close()
    return images


def _run_ocr(img_np):
    """Run the module-level OCR engine on one page image array.

    Uses ``predict`` when the engine exposes it; otherwise falls back to the
    older ``ocr(..., cls=True)`` call.
    NOTE(review): ``predict`` is assumed to be the PaddleOCR 3.x entry point
    and to reject the ``cls`` keyword - confirm against the installed version.
    """
    predict = getattr(ocr, "predict", None)
    if callable(predict):
        return predict(img_np)
    return ocr.ocr(img_np, cls=True)


def _extract_page_text(result):
    """Join the recognized strings of a single-page OCR result with spaces.

    Handles two result layouts:
      * a dict-like first element carrying a ``"rec_texts"`` list
        (presumably the PaddleOCR 3.x layout - verify against the library);
      * a list of ``[box, (text, score)]`` entries as the first element
        (the layout the legacy indexing ``line[1][0]`` expects).

    Returns an empty string when the result is empty or contains no text.
    """
    if not result:
        return ""
    first = result[0]
    if not first:
        return ""
    if isinstance(first, dict):
        texts = first.get("rec_texts") or []
        return " ".join(str(text) for text in texts)
    return " ".join(line[1][0] for line in first)


def ocr_pdf_from_gdc(file_id):
    """Download a PDF from GDC and OCR every page.

    Args:
        file_id: GDC file identifier.

    Returns:
        A single string in which each page that produced text is introduced
        by a ``--- Page N ---`` marker; pages without text are omitted.
        Empty string if no page produced text.

    Raises:
        RuntimeError: If the download fails (see ``download_pdf_to_memory``).
    """
    # Download PDF to memory
    pdf_bytes = download_pdf_to_memory(file_id)

    # Convert to images
    images = pdf_to_images(pdf_bytes)

    all_text = []
    for page_idx, img in enumerate(images, 1):
        img_np = np.array(img)
        page_text = _extract_page_text(_run_ocr(img_np))
        if page_text:
            all_text.append(f"\n--- Page {page_idx} ---\n{page_text}")

    return "\n".join(all_text)


def main():
    """OCR every file listed in the manifest and write one text file per ID.

    The manifest's first line is skipped as a header and the first
    tab-separated column of each remaining non-blank line is used as the
    file ID. Results go to ``OUTPUT_DIR/<file_id>.txt``; IDs that fail are
    written to ``OUTPUT_DIR/failed_ocr_ids.txt``. A summary is printed.
    """
    # Read manifest file
    manifest_file = "/usr/users/3d_dimension_est/selva_sur/RAG/data/file_ids.txt"

    if not Path(manifest_file).exists():
        print(f" ERROR: Manifest file not found: {manifest_file}")
        return

    with open(manifest_file, 'r', encoding="utf-8") as f:
        lines = f.readlines()[1:]  # Skip header

    file_ids = [line.split('\t')[0].strip() for line in lines if line.strip()]

    # Process all files (or limit for testing)
    # file_ids = file_ids[:100]  # Uncomment to test with first 100

    print("\n" + "=" * 70)
    print("PADDLEOCR – DIRECT GDC PDF OCR (NO DOWNLOAD)")
    print("=" * 70)
    print(f"Manifest file : {manifest_file}")
    print(f"Output directory : {OUTPUT_DIR}")
    print(f"Files to process : {len(file_ids):,}")
    print(f"Device : GPU")
    print("=" * 70 + "\n")

    success, failed = 0, 0
    total_chars = 0
    failed_ids = []

    for file_id in tqdm(file_ids, desc="Processing PDFs", unit="files"):
        try:
            # OCR the PDF directly from GDC
            text = ocr_pdf_from_gdc(file_id)

            if not text.strip():
                raise RuntimeError("No text extracted")

            # Save output
            out_file = OUTPUT_DIR / f"{file_id}.txt"
            out_file.write_text(text, encoding="utf-8")

            total_chars += len(text)
            success += 1

        except Exception as e:
            # Top-level boundary: record the failure and keep going so one
            # bad file does not abort the whole batch.
            failed += 1
            failed_ids.append(file_id)
            tqdm.write(f" {file_id}: {e}")

        # Small delay to avoid hammering the API
        time.sleep(0.1)

    print("\n" + "=" * 70)
    print("OCR SUMMARY")
    print("=" * 70)
    print(f"Total attempted : {len(file_ids):,}")
    print(f"Successful : {success:,}")
    print(f"Failed : {failed:,}")
    print(f"Success rate : {100 * success / max(len(file_ids), 1):.1f}%")
    print(f"Total text : {total_chars:,} chars")
    print("=" * 70)

    # Save failed IDs
    if failed_ids:
        failed_file = OUTPUT_DIR / "failed_ocr_ids.txt"
        with open(failed_file, 'w', encoding="utf-8") as f:
            f.write("file_id\n")
            f.writelines(f"{fid}\n" for fid in failed_ids)
        print(f"\n Failed file IDs saved to: {failed_file}")


if __name__ == "__main__":
    main()