# suryaprakash01's picture
# Upload 9 files
# 006e0a7 verified
#!/usr/bin/env python3
"""
Direct GDC PDF OCR without downloading - streams PDFs from GDC API
"""
from pathlib import Path
from tqdm import tqdm
import fitz # PyMuPDF
from paddleocr import PaddleOCR
from PIL import Image
import numpy as np
import requests
import io
import time
# Directory that receives one OCR text file per processed PDF.
# It is created at import time so later writes never hit a missing folder.
OUTPUT_DIR = Path("paddleocr_results/text")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# One shared OCR engine, built once at import and reused for every page.
# Configured for English, text-line orientation handling, and the GPU device.
ocr = PaddleOCR(lang="en", device="gpu", use_textline_orientation=True)
def download_pdf_to_memory(file_id: str) -> io.BytesIO:
    """Download a PDF from the GDC data endpoint into an in-memory buffer.

    Args:
        file_id: Identifier appended to ``https://api.gdc.cancer.gov/data/``.

    Returns:
        A ``BytesIO`` holding the full response body, rewound to offset 0.

    Raises:
        RuntimeError: If the request, the HTTP status check, or reading the
            body fails. The original exception is attached as ``__cause__``.
    """
    url = f"https://api.gdc.cancer.gov/data/{file_id}"
    try:
        # The context manager releases the streamed connection even when the
        # status check or a chunk read raises part-way through.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            pdf_bytes = io.BytesIO()
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    pdf_bytes.write(chunk)
    except Exception as e:
        # Keep the RuntimeError contract, but chain the cause for debugging.
        raise RuntimeError(f"Failed to download: {e}") from e
    pdf_bytes.seek(0)
    return pdf_bytes
def pdf_to_images(pdf_bytes, dpi=200):
    """Render every page of an in-memory PDF to a PIL RGB image.

    Args:
        pdf_bytes: PDF content passed to ``fitz.open`` as its ``stream``
            (the caller in this file supplies an ``io.BytesIO``).
        dpi: Rendering resolution handed to ``page.get_pixmap``; defaults to
            the previously hard-coded 200.

    Returns:
        A list with one ``PIL.Image`` per page, in document order.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        images = []
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            # NOTE(review): "RGB" assumes the pixmap has three channels and no
            # alpha, which should be get_pixmap's default -- confirm.
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
        return images
    finally:
        # Close the document even when rendering a page raises.
        doc.close()
def ocr_pdf_from_gdc(file_id):
    """Fetch one GDC PDF, OCR each page, and return the combined text.

    The PDF is downloaded into memory, rendered to page images, and each page
    is passed to the module-level ``ocr`` engine. Pages that yield no OCR
    result are left out. Every kept page is prefixed with a
    ``--- Page N ---`` marker (N is 1-based) and sections are joined with
    newlines. Returns an empty string when no page produced text.
    """
    page_images = pdf_to_images(download_pdf_to_memory(file_id))

    sections = []
    for page_number, page_image in enumerate(page_images, start=1):
        # NOTE(review): `cls=True` and the result[0] -> entry[1][0] indexing
        # look like the PaddleOCR 2.x API, while the engine is constructed
        # with 3.x-style keyword names -- confirm against the installed version.
        result = ocr.ocr(np.array(page_image), cls=True)
        if not result or not result[0]:
            continue  # nothing recognised on this page
        # entry[1][0] is presumably the recognised text of one detected line.
        joined = " ".join([entry[1][0] for entry in result[0]])
        sections.append(f"\n--- Page {page_number} ---\n{joined}")

    return "\n".join(sections)
_DEFAULT_MANIFEST_FILE = "/usr/users/3d_dimension_est/selva_sur/RAG/data/file_ids.txt"


def _read_file_ids(manifest_path):
    """Return the non-empty first-column values of a tab-separated manifest, skipping its header row."""
    # Only the first column is consumed, so undecodable bytes elsewhere in a
    # row are replaced rather than aborting the whole run.
    with open(manifest_path, "r", encoding="utf-8", errors="replace") as f:
        next(f, None)  # Skip header
        first_columns = (line.split("\t")[0].strip() for line in f)
        # Dropping empty IDs also covers blank lines and rows whose first
        # column is empty, which would otherwise be requested from the API.
        return [file_id for file_id in first_columns if file_id]


def main(manifest_file=_DEFAULT_MANIFEST_FILE):
    """OCR every PDF listed in a manifest and write one text file per ID.

    Args:
        manifest_file: Path to a tab-separated manifest with a header row and
            the file ID in the first column. Defaults to the path that was
            previously hard-coded, so ``main()`` behaves as before.

    For each ID the text is written to ``OUTPUT_DIR/<file_id>.txt``. Any
    failure (including empty OCR output) is reported through ``tqdm.write``
    and the ID is recorded; recorded IDs are saved to
    ``OUTPUT_DIR/failed_ocr_ids.txt`` at the end. If the manifest does not
    exist, an error is printed and the function returns without processing.
    """
    if not Path(manifest_file).exists():
        print(f" ERROR: Manifest file not found: {manifest_file}")
        return

    file_ids = _read_file_ids(manifest_file)
    # To test on a subset, slice here (e.g. the first 100 IDs).

    print("\n" + "=" * 70)
    print("PADDLEOCR – DIRECT GDC PDF OCR (NO DOWNLOAD)")
    print("=" * 70)
    print(f"Manifest file : {manifest_file}")
    print(f"Output directory : {OUTPUT_DIR}")
    print(f"Files to process : {len(file_ids):,}")
    print("Device : GPU")
    print("=" * 70 + "\n")

    success, failed = 0, 0
    total_chars = 0
    failed_ids = []

    for file_id in tqdm(file_ids, desc="Processing PDFs", unit="files"):
        try:
            # OCR the PDF directly from GDC
            text = ocr_pdf_from_gdc(file_id)
            if not text.strip():
                raise RuntimeError("No text extracted")
            # Save output
            out_file = OUTPUT_DIR / f"{file_id}.txt"
            out_file.write_text(text, encoding="utf-8")
            total_chars += len(text)
            success += 1
        except Exception as e:
            # Per-file boundary: one bad PDF must not abort the batch, so the
            # error is reported and the ID kept for the failure list.
            failed += 1
            failed_ids.append(file_id)
            tqdm.write(f" {file_id}: {e}")
        # Small delay to avoid hammering the API
        time.sleep(0.1)

    print("\n" + "=" * 70)
    print("OCR SUMMARY")
    print("=" * 70)
    print(f"Total attempted : {len(file_ids):,}")
    print(f"Successful : {success:,}")
    print(f"Failed : {failed:,}")
    # max(..., 1) avoids division by zero when the manifest lists no IDs.
    print(f"Success rate : {100 * success / max(len(file_ids), 1):.1f}%")
    print(f"Total text : {total_chars:,} chars")
    print("=" * 70)

    # Save failed IDs
    if failed_ids:
        failed_file = OUTPUT_DIR / "failed_ocr_ids.txt"
        failed_file.write_text(
            "file_id\n" + "".join(f"{fid}\n" for fid in failed_ids),
            encoding="utf-8",
        )
        print(f"\n Failed file IDs saved to: {failed_file}")


if __name__ == "__main__":
    main()