# Last change: "sources fix" by mtyrrell (commit f32f200)
import re
import logging
logger = logging.getLogger(__name__)
from typing import List, Dict, Any, Optional, Tuple
from langchain_core.documents import Document
# ---------------------------------------------------------------------
# Core Processing Functions
# ---------------------------------------------------------------------
def parse_citations(response: str) -> List[int]:
    """Parse citation numbers from response text, handling both [1] and [1,2,3] formats"""
    # Each bracket group may hold a single number or a comma-separated run,
    # e.g. "[1]" or "[1, 2, 3]"; collect every digit run inside each group.
    seen = set()
    for group in re.findall(r'\[([\d,\s]+)\]', response):
        seen.update(int(token) for token in re.findall(r'\d+', group))
    citation_numbers = sorted(seen)
    logger.debug(f"Probable Citations found: {citation_numbers}")
    return citation_numbers
def extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
    """Extract sources that were cited in the response"""
    if not cited_numbers:
        return []
    cited_sources = []
    limit = len(processed_results)
    for num in cited_numbers:
        idx = num - 1  # citations are 1-based; processed_results is 0-based
        if not (0 <= idx < limit):
            continue  # citation points outside the retrieved set — ignore it
        entry = dict(processed_results[idx])  # shallow copy so the original dict is untouched
        entry['_citation_number'] = num  # preserve the original citation number
        cited_sources.append(entry)
    return cited_sources
def clean_citations(response: str) -> str:
    """Normalize all citation formats to [x] and remove unwanted sections"""
    # First strip trailing References/Sources/Bibliography sections.
    # DOTALL makes '.*$' swallow everything after the heading to end of text.
    section_patterns = (
        r'\n\s*#+\s*References?\s*:?.*$',
        r'\n\s*#+\s*Sources?\s*:?.*$',
        r'\n\s*#+\s*Bibliography\s*:?.*$',
        r'\n\s*References?\s*:.*$',
        r'\n\s*Sources?\s*:.*$',
        r'\n\s*Bibliography\s*:.*$',
    )
    for section in section_patterns:
        response = re.sub(section, '', response, flags=re.IGNORECASE | re.DOTALL)

    # Ordered normalization rules: (pattern, replacement, flags).
    # Order matters — earlier rules can consume text later rules would match.
    rules = (
        # (Document X, Page Y, Year Z) -> [X]
        (r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)', r'[\1]', re.IGNORECASE),
        # [Document X, Page Y, Year Z] -> [X]
        (r'\[Document\s+(\d+)(?:[^\]]*)\]', r'[\1]', re.IGNORECASE),
        # [Document X: filename, Page Y, Year Z] -> [X]
        (r'\[Document\s+(\d+):[^\]]+\]', r'[\1]', re.IGNORECASE),
        # [X.Y.Z] section-number style -> [X]
        (r'\[(\d+)\.[\d\.]+\]', r'[\1]', 0),
        # (Document X) -> [X]
        (r'\(Document\s+(\d+)\)', r'[\1]', re.IGNORECASE),
        # Bare "Document X, Page Y, Year Z" (no brackets) -> [X]
        (r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.])', r'[\1]', re.IGNORECASE),
        # "Document X states/says/mentions/..." -> [X]
        (r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates|notes|shows)', r'[\1]', re.IGNORECASE),
        # Collapse doubled citations [[X]] -> [X]
        (r'\[\[(\d+)\]\]', r'[\1]', 0),
        # Collapse any run of whitespace (including newlines) to one space.
        (r'\s+', ' ', 0),
    )
    for pattern, replacement, flags in rules:
        response = re.sub(pattern, replacement, response, flags=flags)
    return response.strip()
def process_context(
    context: List[Document],
    metadata_fields_to_include: Optional[List[str]] = None
) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Processes LangChain Documents, extracts content and selected metadata,
    and returns the formatted context string and the processed list of results.

    Args:
        context: A list of LangChain Document objects from the retriever.
        metadata_fields_to_include: Optional list of metadata keys (e.g., ['source', 'page'])
            to include in the formatted context string sent to the LLM.

    Returns:
        A tuple: (formatted_context_string, processed_results_list)

    Raises:
        ValueError: If ``context`` is not a list of Document objects.
    """
    # Lazy %-style logging args: the (potentially huge) context is only
    # rendered into a string when the DEBUG level is actually enabled.
    logger.debug("Context Processing: \n Context: %s \n Metadata fields: %s", context, metadata_fields_to_include)
    logger.info("Context Processing with metadata_fields_to_include: %s", metadata_fields_to_include)

    # 1. Input validation — fail fast on anything that is not a list of Documents.
    if not isinstance(context, list) or not all(isinstance(doc, Document) for doc in context):
        raise ValueError("Context must be a list of LangChain Document objects.")
    if not context:
        return "", []

    metadata_fields_to_include = metadata_fields_to_include or []
    processed_results: List[Dict[str, Any]] = []
    context_parts: List[str] = []

    # 2. Standardize structure and build the context string (citations are 1-based).
    for i, doc in enumerate(context, 1):
        # The primary dictionary that holds all info for this document.
        doc_info: Dict[str, Any] = {
            'answer': doc.page_content,
            '__all_metadata__': doc.metadata,  # full metadata kept for citation linking later
            '_citation_number_key': i,         # the 1-based citation number
        }

        # Extract the selected metadata fields for the prompt string.
        metadata_str_parts = []
        for field in metadata_fields_to_include:
            value = doc.metadata.get(field)
            if value is not None:
                # Store the value in doc_info and format it for the prompt,
                # e.g. 'document_type' -> 'Document Type'.
                doc_info[field] = value
                field_name = field.replace('_', ' ').title()
                metadata_str_parts.append(f"{field_name}: {value}")

        # Build the per-document string, e.g. "[1] **This is Metadata** ...".
        if metadata_str_parts:
            metadata_line = " | ".join(metadata_str_parts)
            context_str = f"[{i}] **This is Metadata** \n ({metadata_line})\n **Contextual Text** \n {doc.page_content}"
        else:
            context_str = f"[{i}]\n{doc.page_content}"

        logger.debug(" Updated Context %s: %s", i, context_str)
        context_parts.append(context_str)
        processed_results.append(doc_info)

    formatted_context = "\n---\n".join(context_parts)
    return formatted_context, processed_results
def create_sources_list(
    cited_sources: List[Dict[str, Any]],
    title_metadata_fields: List[str],
    link_metadata_field: str
) -> List[Dict[str, str]]:
    """
    Create sources list for ChatUI format using configuration for title and link fields.

    Args:
        cited_sources: List of standardized dictionaries that were cited.
        title_metadata_fields: Metadata keys (e.g., ['document_type', 'decision_number'])
            joined with ' - ' to build the source title.
        link_metadata_field: The single metadata key (e.g., 'document_url') holding the source link (URL).
    """
    logger.info("creating sources list for ChatUI")
    logger.debug(f"Raw Cited sources: {cited_sources}")
    sources: List[Dict[str, str]] = []
    for entry in cited_sources:
        # Work from the original, full metadata dictionary.
        meta = entry.get('__all_metadata__', {})
        citation_num = entry.get('_citation_number', 'N/A')

        # 1. Title from the configured fields; fall back to "Source <n>".
        parts = [str(meta.get(field)) for field in title_metadata_fields if meta.get(field) is not None]
        title = " - ".join(parts) if parts else f"Source {citation_num}"

        # 2. Link from the configured field. ChatUI requires doc://, http:// or
        # https:// schemes, so 'doc://#' is the placeholder for empty/'#' links.
        link = meta.get(link_metadata_field)
        sources.append({
            "uri": link if link and link != '#' else 'doc://#',
            "title": title
        })
    logger.debug(f"formatted cited sources :{sources}")
    return sources