Spaces:

suryaprakash01
/

rag

Runtime error

App Files Files Community

rag / src /oc.py

suryaprakash01

Upload 6790 files

82c35a1 verified about 1 month ago

raw

history blame contribute delete

4.35 kB

	#!/usr/bin/env python3
	"""
	Direct GDC PDF OCR without saving PDFs to disk - each PDF is downloaded from the GDC API into memory
	"""

	from pathlib import Path
	from tqdm import tqdm
	import fitz # PyMuPDF
	from paddleocr import PaddleOCR
	from PIL import Image
	import numpy as np
	import requests
	import io
	import time

	# Directory that receives one .txt file per processed PDF; it is created
	# (including parents) as soon as this module is loaded.
	OUTPUT_DIR = Path("paddleocr_results") / "text"
	OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

	# Single module-level PaddleOCR engine, constructed once and shared by the
	# functions in this file.
	ocr = PaddleOCR(lang="en", use_textline_orientation=True, device="gpu")


	def download_pdf_to_memory(file_id):
	    """Download a PDF from the GDC API directly into memory.

	    Args:
	        file_id: GDC file identifier; it is appended to the ``/data/``
	            endpoint URL.

	    Returns:
	        An ``io.BytesIO`` holding the complete response body, rewound to
	        position 0.

	    Raises:
	        RuntimeError: If the request, the HTTP status check, or reading the
	            body fails. The underlying exception is chained as
	            ``__cause__``.
	    """
	    url = f"https://api.gdc.cancer.gov/data/{file_id}"

	    try:
	        # The context manager closes the streamed response (and releases
	        # its connection) even when the transfer fails part-way through.
	        with requests.get(url, stream=True, timeout=60) as response:
	            response.raise_for_status()

	            # Accumulate the body in 1 MiB chunks.
	            pdf_bytes = io.BytesIO()
	            for chunk in response.iter_content(chunk_size=1024 * 1024):
	                if chunk:
	                    pdf_bytes.write(chunk)
	    except Exception as e:
	        # Kept broad on purpose: callers rely on every download failure
	        # surfacing as RuntimeError.
	        raise RuntimeError(f"Failed to download: {e}") from e

	    pdf_bytes.seek(0)
	    return pdf_bytes


	def pdf_to_images(pdf_bytes):
	    """Render every page of an in-memory PDF to a PIL image.

	    Args:
	        pdf_bytes: PDF data handed to ``fitz.open`` as its ``stream``
	            argument (callers in this file pass an ``io.BytesIO``).

	    Returns:
	        A list with one ``"RGB"``-mode PIL image per page, in page order,
	        rendered at 200 DPI.
	    """
	    images = []

	    # Opening the document as a context manager guarantees it is closed
	    # even if rendering one of the pages raises.
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	        for page in doc:
	            pix = page.get_pixmap(dpi=200)
	            # get_pixmap is called with its defaults, which PyMuPDF documents
	            # as RGB without alpha, so the samples map onto PIL's "RGB" mode.
	            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
	            images.append(img)

	    return images


	def _recognized_texts(result):
	    """Return the recognised strings from one page's PaddleOCR result."""
	    if not result or not result[0]:
	        return []

	    page = result[0]
	    if isinstance(page, dict):
	        # PaddleOCR 3.x layout: a mapping per image with the recognised
	        # strings stored under "rec_texts".
	        return list(page.get("rec_texts") or [])

	    # Legacy layout: one [box, (text, score)] entry per detected line.
	    return [line[1][0] for line in page]


	def ocr_pdf_from_gdc(file_id):
	    """Download a PDF from GDC and return its OCR text.

	    Args:
	        file_id: GDC file identifier of the PDF to process.

	    Returns:
	        A single string with one ``--- Page N ---`` section (N counted from
	        1) for every page on which text was recognised, or an empty string
	        when no page yielded any text.

	    Raises:
	        RuntimeError: Propagated from ``download_pdf_to_memory`` when the
	            download fails.
	    """
	    pdf_bytes = download_pdf_to_memory(file_id)
	    images = pdf_to_images(pdf_bytes)

	    # The module-level engine is configured with PaddleOCR 3.x constructor
	    # arguments. That API generation exposes ``predict`` and rejects the
	    # legacy ``cls`` keyword, so prefer ``predict`` when it exists and fall
	    # back to the legacy ``ocr(..., cls=True)`` call otherwise.
	    predict = getattr(ocr, "predict", None)

	    all_text = []
	    for page_idx, img in enumerate(images, 1):
	        img_np = np.array(img)

	        if predict is not None:
	            result = predict(img_np)
	        else:
	            result = ocr.ocr(img_np, cls=True)

	        texts = _recognized_texts(result)
	        if texts:
	            # Pages without recognised text are skipped entirely, so an
	            # all-empty document still produces an empty string.
	            page_text = " ".join(texts)
	            all_text.append(f"\n--- Page {page_idx} ---\n{page_text}")

	    return "\n".join(all_text)


	# Manifest used when main() is called without an argument.
	_DEFAULT_MANIFEST_FILE = "/usr/users/3d_dimension_est/selva_sur/RAG/data/file_ids.txt"


	def main(manifest_file=_DEFAULT_MANIFEST_FILE):
	    """OCR every PDF listed in a manifest and write the text to OUTPUT_DIR.

	    The manifest is a tab-separated text file whose first line is a header
	    and whose first column holds the file ID. For each ID the PDF is
	    processed with ``ocr_pdf_from_gdc`` and the text is saved as
	    ``<file_id>.txt`` in ``OUTPUT_DIR``. IDs that fail (including PDFs that
	    yield no text) are collected and written to ``failed_ocr_ids.txt`` in
	    the same directory. Progress and a summary are printed to stdout.

	    Args:
	        manifest_file: Path of the manifest to read. Defaults to the
	            location this script has always used.

	    Returns:
	        None. If the manifest does not exist, an error is printed and the
	        function returns without processing anything.
	    """
	    manifest_path = Path(manifest_file)
	    if not manifest_path.exists():
	        print(f" ERROR: Manifest file not found: {manifest_file}")
	        return

	    with manifest_path.open("r", encoding="utf-8") as f:
	        next(f, None)  # Skip header
	        first_columns = (line.split("\t")[0].strip() for line in f)
	        # Dropping empty values covers both blank lines and rows whose
	        # first column is empty, which would otherwise become bogus IDs.
	        file_ids = [fid for fid in first_columns if fid]

	    # Process all files (or limit for testing)
	    # file_ids = file_ids[:100] # Uncomment to test with first 100

	    rule = "=" * 70
	    print("\n" + rule)
	    print("PADDLEOCR – DIRECT GDC PDF OCR (NO DOWNLOAD)")
	    print(rule)
	    print(f"Manifest file : {manifest_file}")
	    print(f"Output directory : {OUTPUT_DIR}")
	    print(f"Files to process : {len(file_ids):,}")
	    print("Device : GPU")
	    print(rule + "\n")

	    success = 0
	    total_chars = 0
	    failed_ids = []

	    for file_id in tqdm(file_ids, desc="Processing PDFs", unit="files"):
	        try:
	            text = ocr_pdf_from_gdc(file_id)

	            if not text.strip():
	                raise RuntimeError("No text extracted")

	            out_file = OUTPUT_DIR / f"{file_id}.txt"
	            out_file.write_text(text, encoding="utf-8")

	            total_chars += len(text)
	            success += 1

	        except Exception as e:
	            # Top-level boundary: record the failure and keep going so one
	            # bad file does not abort the whole batch.
	            failed_ids.append(file_id)
	            tqdm.write(f" {file_id}: {e}")

	        # Small delay to avoid hammering the API
	        time.sleep(0.1)

	    print("\n" + rule)
	    print("OCR SUMMARY")
	    print(rule)
	    print(f"Total attempted : {len(file_ids):,}")
	    print(f"Successful : {success:,}")
	    print(f"Failed : {len(failed_ids):,}")
	    # max(..., 1) avoids a division by zero for an empty manifest.
	    print(f"Success rate : {100 * success / max(len(file_ids), 1):.1f}%")
	    print(f"Total text : {total_chars:,} chars")
	    print(rule)

	    # Save failed IDs
	    if failed_ids:
	        failed_file = OUTPUT_DIR / "failed_ocr_ids.txt"
	        with open(failed_file, "w", encoding="utf-8") as f:
	            f.write("file_id\n")
	            f.writelines(f"{fid}\n" for fid in failed_ids)
	        print(f"\n Failed file IDs saved to: {failed_file}")


	# Script entry point: start the batch run only when this file is executed
	# directly, not when it is imported.
	if __name__ == "__main__":
	    main()