#!/usr/bin/env python3
"""
Biomedical NLP Pipeline - Using Microsoft BiomedBERT
"""
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List

from tqdm import tqdm

# Microsoft BiomedBERT
from sentence_transformers import SentenceTransformer

# spaCy for NER (optional dependency: the pipeline degrades gracefully
# to embeddings-only when it is not installed)
try:
    import spacy
    SPACY_AVAILABLE = True
except ImportError:
    SPACY_AVAILABLE = False
    print("spaCy not available. Install: pip install spacy scispacy")
class BiomedBERTPipeline:
    """
    Pipeline combining Microsoft BiomedBERT sentence embeddings with
    spaCy named-entity recognition.

    Per input text it produces a dense embedding vector plus a list of
    entity spans; per directory it writes one JSON file (entities and
    metadata) and, optionally, one .npy embedding file per input .txt.
    """

    def __init__(self, biomedbert_model: str = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"):
        """
        Initialize with Microsoft BiomedBERT.

        Args:
            biomedbert_model: HuggingFace model name passed to
                SentenceTransformer (downloaded on first use, then cached).
        """
        print(f" Loading Microsoft BiomedBERT: {biomedbert_model}")
        print(" (First run downloads ~400MB, then cached)")
        self.embedder = SentenceTransformer(biomedbert_model)
        print(f" BiomedBERT loaded (embedding dim: {self.embedder.get_sentence_embedding_dimension()})")
        # Load a spaCy model for NER: prefer the scispaCy medical model,
        # fall back to the general English model, else run without NER.
        if SPACY_AVAILABLE:
            print(" Loading medical spaCy model...")
            try:
                # Try medical model first
                self.nlp = spacy.load("en_core_sci_md")
                print("Medical spaCy model (en_core_sci_md) loaded")
            except OSError:
                # spacy.load raises OSError for a missing model package;
                # was a bare `except:` that also swallowed KeyboardInterrupt.
                try:
                    # Fallback to general model
                    self.nlp = spacy.load("en_core_web_sm")
                    print(" General spaCy model (en_core_web_sm) loaded")
                except OSError:
                    print(" No spaCy model found. Running without NER.")
                    self.nlp = None
        else:
            self.nlp = None

    def process_text(self, text: str) -> Dict:
        """
        Process a single text with BiomedBERT embeddings + NER.

        Args:
            text: Input text.

        Returns:
            Dict with keys "timestamp" (ISO string), "embeddings"
            (list of floats) and "entities" (list of span dicts with
            "text", "type", "start", "end").
        """
        result = {
            "timestamp": datetime.now().isoformat(),
            "embeddings": None,
            "entities": []
        }
        # Generate embeddings with BiomedBERT
        embedding = self.embedder.encode(text, convert_to_numpy=True)
        result["embeddings"] = embedding.tolist()
        # Extract entities with spaCy (skipped when no model was loaded)
        if self.nlp:
            doc = self.nlp(text)
            for ent in doc.ents:
                result["entities"].append({
                    "text": ent.text,
                    "type": ent.label_,
                    "start": ent.start_char,
                    "end": ent.end_char
                })
        return result

    def process_directory(self, input_dir: str, output_dir: str, save_embeddings: bool = True) -> List[Dict]:
        """
        Process all .txt files in a directory.

        Args:
            input_dir: Directory with text files.
            output_dir: Output directory (created if missing).
            save_embeddings: Whether to save embeddings as .npy (can be large!).

        Returns:
            List of per-file metadata dicts (without the embeddings).
        """
        input_dir = Path(input_dir)
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        files = list(input_dir.glob("*.txt"))
        if not files:
            print(f" No .txt files found in {input_dir}")
            return []
        print(f"\n Found {len(files)} text files")
        print(f" Processing with Microsoft BiomedBERT...\n")
        if save_embeddings:
            # Local import hoisted out of the loop (it was re-executed
            # on every iteration); only needed when persisting vectors.
            import numpy as np
        all_results = []
        success_count = 0
        failed_count = 0
        for txt_file in tqdm(files, desc="Processing files"):
            try:
                text = txt_file.read_text(encoding="utf-8")
                result = self.process_text(text)
                result["filename"] = txt_file.stem
                # Embeddings stay out of the JSON (too large); persist
                # them separately as numpy arrays when requested.
                if save_embeddings:
                    emb_file = output_dir / f"{txt_file.stem}_embedding.npy"
                    np.save(emb_file, result["embeddings"])
                # Save entities and metadata (without embeddings)
                output_data = {
                    "filename": result["filename"],
                    "timestamp": result["timestamp"],
                    "entities": result["entities"],
                    "entity_count": len(result["entities"]),
                    "has_embedding": save_embeddings
                }
                out_file = output_dir / f"{txt_file.stem}_nlp.json"
                with open(out_file, "w") as f:
                    json.dump(output_data, f, indent=2)
                all_results.append(output_data)
                success_count += 1
                if success_count % 100 == 0:
                    print(f"\n Progress: {success_count}/{len(files)} files")
            except Exception as e:
                # Best-effort batch processing: report the failure and
                # keep going with the remaining files.
                failed_count += 1
                print(f"\n FAILED | {txt_file.name} | {e}")
        print(f"\n{'='*70}")
        print(f"PROCESSING COMPLETE")
        print(f"{'='*70}")
        print(f"Total files : {len(files)}")
        print(f"Successful : {success_count}")
        print(f"Failed : {failed_count}")
        # len(files) > 0 is guaranteed by the early return above.
        print(f"Success rate : {(success_count/len(files)*100):.1f}%")
        print(f"{'='*70}")
        self._save_summary(all_results, output_dir)
        return all_results

    def _save_summary(self, results: List[Dict], output_dir: Path):
        """Write processing_summary.json and print an entity-type report.

        Args:
            results: Per-file metadata dicts, each with an "entities" list.
            output_dir: Directory the summary JSON is written into.
        """
        summary = {
            "total_files": len(results),
            "total_entities": sum(len(r["entities"]) for r in results),
            "timestamp": datetime.now().isoformat(),
            "entity_types": {},
            "model": "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
        }
        # Tally entity labels across all files.
        for r in results:
            for e in r["entities"]:
                summary["entity_types"][e["type"]] = \
                    summary["entity_types"].get(e["type"], 0) + 1
        with open(output_dir / "processing_summary.json", "w") as f:
            json.dump(summary, f, indent=2)
        print("\n" + "=" * 70)
        print("BIOMEDBERT PROCESSING SUMMARY")
        print("=" * 70)
        print(f"Model : Microsoft BiomedBERT")
        print(f"Total entities: {summary['total_entities']:,}")
        print(f"\nTop Entity Types:")
        sorted_types = sorted(summary["entity_types"].items(),
                              key=lambda x: x[1], reverse=True)
        for etype, count in sorted_types[:15]:
            print(f" {etype:20s} : {count:6,}")
        print("=" * 70)
def main(input_dir: str = "/usr/users/3d_dimension_est/selva_sur/RAG/output/text",
         output_dir: str = "/usr/users/3d_dimension_est/selva_sur/RAG/output/biomedbert_output"):
    """Run the BiomedBERT pipeline over *input_dir*, writing to *output_dir*.

    Args:
        input_dir: Directory of .txt files to process. Defaults to the
            original hard-coded location so existing callers (and the
            __main__ guard) behave exactly as before.
        output_dir: Directory for the per-file JSON/embedding outputs.
    """
    print("= " * 20)
    print("MICROSOFT BIOMEDBERT PIPELINE")
    print("= " * 20)
    print(f"\nConfiguration:")
    print(f" Input : {input_dir}")
    print(f" Output : {output_dir}")
    print(f"\nModel:")
    print(f" • Microsoft BiomedBERT (HuggingFace)")
    print(f" • spaCy medical NER")
    print("="*70)
    try:
        pipeline = BiomedBERTPipeline()
        results = pipeline.process_directory(
            input_dir=input_dir,
            output_dir=output_dir,
            save_embeddings=True  # Set False to save space
        )
        print(f"\n COMPLETE: {len(results)} files processed")
        print(f"Results: {output_dir}")
    except Exception as e:
        # Top-level boundary: report the error with a traceback instead
        # of letting the script die silently.
        print(f"\n Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()