Spaces:

AbdullahKhanSherwani
/

BlackBox

Sleeping

BlackBox / src /data_prep /context_generator.py

hamzahisam

my changes

e1aa493 2 months ago

3.76 kB

	import json
	import logging
	from pathlib import Path
	from tqdm import tqdm
	import sys

	# Add project root to path
	# The root is taken as three directory levels above this file. The append must
	# run before the `src.` import below; presumably it is what lets that import
	# resolve when this file is executed directly (confirm against how it is launched).
	project_root = Path(__file__).parent.parent.parent
	sys.path.append(str(project_root))

	from src.llm.ollama_client import call_ollama, is_ollama_running

	# NOTE: this configures the root logger as a side effect of importing this module.
	# logging.basicConfig does nothing if the root logger already has handlers.
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	# JSON file in which contextualize_chunks persists generated contexts, keyed by chunk_id.
	CACHE_FILE = project_root / "data" / "processed" / "context_cache.json"

	# System prompt passed to call_ollama with every context-generation request.
	CONTEXT_SYSTEM_PROMPT = """
	You are an expert AI extraction system helping to optimize documents for search retrieval.
	Your task is to concisely situate the given chunk within the context of the larger document.
	"""

	def generate_context_prompt(whole_document: str, chunk_content: str) -> str:
	    """
	    Build the user prompt that asks the LLM to situate one chunk in its document.

	    The whole document is wrapped in <document> tags and the chunk in <chunk> tags,
	    followed by an instruction to answer with only a short situating context.
	    The wording follows Anthropic's recommended Contextual Retrieval prompt.
	    """
	    # Plain template rendered with str.format; the inserted values are not
	    # re-parsed, so braces inside the document or chunk are emitted verbatim.
	    template = """<document>
	{whole_document}
	</document>

	Here is the chunk we want to situate within the whole document:
	<chunk>
	{chunk_content}
	</chunk>

	Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else."""
	    return template.format(whole_document=whole_document, chunk_content=chunk_content)


	def _load_context_cache() -> dict:
	    """Read the on-disk context cache; return an empty dict if it is missing or unusable."""
	    if not CACHE_FILE.exists():
	        return {}
	    try:
	        with open(CACHE_FILE, 'r', encoding='utf-8') as f:
	            cached = json.load(f)
	    except ValueError as e:
	        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
	        # A damaged cache must not block context generation; it is simply rebuilt.
	        logger.warning("Ignoring unreadable context cache %s: %s", CACHE_FILE, e)
	        return {}
	    if not isinstance(cached, dict):
	        logger.warning("Ignoring context cache %s: expected a JSON object", CACHE_FILE)
	        return {}
	    return cached


	def _save_context_cache(context_cache: dict) -> None:
	    """Write the cache to a sibling temp file, then swap it in over CACHE_FILE."""
	    tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
	    with open(tmp_file, 'w', encoding='utf-8') as f:
	        json.dump(context_cache, f, indent=4)
	    # Replacing in one step means an interrupted run never leaves a truncated cache.
	    tmp_file.replace(CACHE_FILE)


	def contextualize_chunks(chunks: list[dict], whole_document: str, model: str = "qwen2.5:32b") -> list[dict]:
	    """
	    Generate a short situating context for every chunk and prepend it to the chunk text.

	    Expected chunk format: {'chunk_id': '...', 'text': '...'}

	    Contexts are cached in CACHE_FILE keyed by str(chunk_id). The key does not include
	    the document or the model, so a cached entry is reused even if either changes.
	    Chunks without a chunk_id are never cached, so they cannot pick up another
	    chunk's context.

	    Args:
	        chunks: Chunk mappings to contextualize; the inputs are not modified.
	        whole_document: Full text of the document the chunks come from.
	        model: Model name passed to call_ollama.

	    Returns:
	        A new list with one copy of each chunk, extended with 'context' (the generated
	        or cached context, "" if generation failed) and 'contextualized_text' (the
	        context, a blank line, then the chunk text; just the chunk text when the
	        context is empty).

	    Raises:
	        RuntimeError: If Ollama is not running.
	    """
	    if not is_ollama_running():
	        raise RuntimeError("Ollama is not running. Context generation requires the local LLM.")

	    CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
	    context_cache = _load_context_cache()

	    contextualized_chunks = []

	    for chunk in tqdm(chunks, desc="Generating contexts for chunks"):
	        chunk_id = chunk.get('chunk_id')
	        chunk_text = chunk.get('text', '')

	        # JSON object keys are always strings, so normalise the id up front; otherwise
	        # a non-string id would miss the cache after every reload. A missing id gets
	        # no key at all instead of sharing the single "null" entry.
	        cache_key = None if chunk_id is None else str(chunk_id)

	        if cache_key is not None and cache_key in context_cache:
	            context = context_cache[cache_key]
	        else:
	            prompt = generate_context_prompt(whole_document, chunk_text)
	            try:
	                # Temperature 0 for deterministic, factual outputs
	                context = call_ollama(
	                    prompt=prompt,
	                    system_prompt=CONTEXT_SYSTEM_PROMPT,
	                    model=model,
	                    temperature=0.0,
	                    max_tokens=150
	                ).strip()
	            except Exception as e:
	                # Best effort: a failed generation leaves the chunk without a context.
	                logger.error("Failed to generate context for chunk %s: %s", chunk_id, e)
	                context = ""
	            else:
	                if cache_key is not None:
	                    context_cache[cache_key] = context
	                    # Saved after every new context so an interrupted run keeps its
	                    # progress. A failed save must not discard the generated context.
	                    try:
	                        _save_context_cache(context_cache)
	                    except (OSError, TypeError, ValueError) as e:
	                        logger.warning("Could not write context cache %s: %s", CACHE_FILE, e)

	        # Build contextualized text
	        augmented_text = f"{context}\n\n{chunk_text}" if context else chunk_text

	        # Create a new chunk dict preserving original structure
	        new_chunk = chunk.copy() if isinstance(chunk, dict) else dict(chunk)
	        new_chunk['context'] = context
	        new_chunk['contextualized_text'] = augmented_text
	        contextualized_chunks.append(new_chunk)

	    return contextualized_chunks