Spaces:

hugging2021
/

rag-the-game-changer

Build error

App Files Files Community

rag-the-game-changer / generation_components /output_formatting.py

hugging2021

Upload folder using huggingface_hub

40f6dcf verified about 2 months ago

raw

history blame contribute delete

9.74 kB

	"""
	Output Formatting - RAG-The-Game-Changer

	Advanced output formatting and structuring for RAG responses.
	"""

	import html
	import json
	import logging
	from dataclasses import dataclass, field
	from enum import Enum
	from typing import Any, Dict, List, Optional, Union
	from xml.sax.saxutils import escape as xml_escape

	logger = logging.getLogger(__name__)


	class OutputFormat(Enum):
	    """Supported output formats.

	    Each member's value is the lowercase string identifier of the format, so
	    a member can be obtained from that string, e.g. ``OutputFormat("json")``.
	    """

	    JSON = "json"
	    MARKDOWN = "markdown"
	    HTML = "html"
	    PLAIN_TEXT = "plain_text"
	    XML = "xml"


	@dataclass
	class FormattedOutput:
	    """Formatted response with structure and citations."""

	    # Answer text exactly as it was handed to the formatter.
	    answer: str
	    # Format that ``formatted_content`` is rendered in.
	    format_type: OutputFormat
	    # The answer (plus any citations/confidence) rendered as a single string.
	    formatted_content: str
	    # One dict per cited chunk.
	    citations: List[Dict[str, Any]]
	    # Free-form extra information about the response; empty dict by default.
	    metadata: Dict[str, Any] = field(default_factory=dict)
	    # Optional unformatted payload kept alongside the rendered output.
	    raw_data: Any = None


	class OutputFormatter:
	    """Format and structure RAG outputs.

	    Renders an answer together with its retrieved chunks as JSON, Markdown,
	    HTML, XML, or plain text, and derives a citation list from the chunks.

	    Recognised ``config`` keys (all optional):
	        default_format: an ``OutputFormat`` value; defaults to ``"markdown"``.
	        include_citations: include the citation list (default ``True``).
	        include_confidence: include the confidence score (default ``False``).
	        include_sources: include a ``sources`` list in JSON output
	            (default ``True``).
	    """

	    # Maximum number of content characters kept in a citation preview.
	    _PREVIEW_LENGTH = 100
	    # Extra entities needed when a value is placed inside a quoted XML attribute.
	    _XML_ATTR_ENTITIES = {"'": "&apos;", '"': "&quot;"}

	    def __init__(self, config: Optional[Dict[str, Any]] = None):
	        """Read formatting options from ``config``.

	        Args:
	            config: Option mapping; ``None`` or an empty dict selects defaults.

	        Raises:
	            ValueError: If ``default_format`` is not a valid ``OutputFormat`` value.
	        """
	        self.config = config or {}
	        self.default_format = OutputFormat(self.config.get("default_format", "markdown"))
	        self.include_citations = self.config.get("include_citations", True)
	        self.include_confidence = self.config.get("include_confidence", False)
	        self.include_sources = self.config.get("include_sources", True)

	    def format_output(
	        self,
	        answer: str,
	        retrieved_chunks: List[Dict],
	        confidence: float,
	        format_type: Optional[OutputFormat] = None,
	    ) -> FormattedOutput:
	        """Format output according to specified format.

	        Args:
	            answer: Answer text to render.
	            retrieved_chunks: Chunk dicts the citations are derived from.
	            confidence: Confidence score for the answer.
	            format_type: Target format; falls back to ``self.default_format``.

	        Returns:
	            A ``FormattedOutput`` holding the rendered string, the citations,
	            metadata (``confidence`` is ``None`` unless ``include_confidence``
	            is set, plus ``chunk_count``) and the raw inputs.
	        """
	        fmt = format_type or self.default_format
	        citations = self._extract_citations(retrieved_chunks)

	        formatted_content = self._generate_formatted_content(
	            answer=answer,
	            format_type=fmt,
	            citations=citations,
	            confidence=confidence,
	            sources=retrieved_chunks,
	        )

	        return FormattedOutput(
	            answer=answer,
	            format_type=fmt,
	            formatted_content=formatted_content,
	            citations=citations,
	            metadata={
	                "confidence": confidence if self.include_confidence else None,
	                "chunk_count": len(retrieved_chunks),
	            },
	            raw_data={"answer": answer, "chunks": retrieved_chunks, "confidence": confidence},
	        )

	    @classmethod
	    def _preview(cls, content: Any) -> str:
	        """Return at most ``_PREVIEW_LENGTH`` characters of ``content``.

	        The ``"..."`` marker is appended only when text was actually cut off,
	        and a missing (``None``) content yields an empty string.
	        """
	        text = "" if content is None else str(content)
	        if len(text) <= cls._PREVIEW_LENGTH:
	            return text
	        return text[: cls._PREVIEW_LENGTH] + "..."

	    @staticmethod
	    def _escape_html(value: Any) -> str:
	        """Escape ``value`` for use in HTML text or a quoted attribute."""
	        return html.escape(str(value), quote=True)

	    @classmethod
	    def _escape_xml(cls, value: Any, attribute: bool = False) -> str:
	        """Escape ``value`` for XML text, or for a quoted attribute value."""
	        entities = cls._XML_ATTR_ENTITIES if attribute else {}
	        return xml_escape(str(value), entities)

	    def _extract_citations(self, chunks: List[Dict]) -> List[Dict[str, Any]]:
	        """Extract citation information from retrieved chunks.

	        Each citation carries a 1-based ``index``, the chunk's ``id``
	        (``chunk_id`` or a positional fallback), ``document_id``, ``title``,
	        ``source``, a short ``content`` preview and, when the chunk has a
	        non-empty ``metadata`` entry, that metadata.
	        """
	        citations = []

	        for i, chunk in enumerate(chunks):
	            citation = {
	                "index": i + 1,
	                "id": chunk.get("chunk_id", f"chunk_{i}"),
	                "document_id": chunk.get("document_id", ""),
	                "title": chunk.get("title", "Unknown"),
	                "source": chunk.get("source", "Unknown"),
	                "content": self._preview(chunk.get("content")),
	            }

	            if chunk.get("metadata"):
	                citation["metadata"] = chunk["metadata"]

	            citations.append(citation)

	        return citations

	    def _generate_formatted_content(
	        self,
	        answer: str,
	        format_type: OutputFormat,
	        citations: List[Dict],
	        confidence: float,
	        sources: List[Dict],
	    ) -> str:
	        """Generate formatted content based on format type.

	        Any format other than JSON, MARKDOWN, HTML or XML is rendered as
	        plain text.
	        """
	        if format_type == OutputFormat.JSON:
	            return self._format_json(answer, citations, confidence, sources)

	        if format_type == OutputFormat.MARKDOWN:
	            return self._format_markdown(answer, citations, confidence, sources)

	        if format_type == OutputFormat.HTML:
	            return self._format_html(answer, citations, confidence, sources)

	        if format_type == OutputFormat.XML:
	            return self._format_xml(answer, citations, confidence, sources)

	        # PLAIN_TEXT (and the fallback for anything unrecognised)
	        return self._format_plain_text(answer, citations, sources)

	    def _format_json(
	        self, answer: str, citations: List[Dict], confidence: float, sources: List[Dict]
	    ) -> str:
	        """Format output as JSON.

	        The ``sources`` list in the output is built from ``citations``; the
	        ``sources`` argument itself is not read.
	        """
	        output = {"answer": answer}

	        if self.include_citations:
	            output["citations"] = citations

	        if self.include_confidence:
	            output["confidence"] = confidence

	        if self.include_sources:
	            output["sources"] = [
	                {"id": c.get("id"), "title": c.get("title"), "source": c.get("source")}
	                for c in citations
	            ]

	        return json.dumps(output, indent=2)

	    def _format_markdown(
	        self, answer: str, citations: List[Dict], confidence: float, sources: List[Dict]
	    ) -> str:
	        """Format output as Markdown (``sources`` is not read)."""
	        lines = [answer]

	        if self.include_citations and citations:
	            lines.append("\n\nSources:")
	            for citation in citations:
	                lines.append(f"\n[{citation['index']}] {citation['title']}")
	                lines.append(f" - Source: {citation['source']}")
	                lines.append(f" - Document ID: {citation['document_id']}")

	        if self.include_confidence:
	            lines.append(f"\n\nConfidence: {confidence:.1%}")

	        return "\n".join(lines)

	    def _format_html(
	        self, answer: str, citations: List[Dict], confidence: float, sources: List[Dict]
	    ) -> str:
	        """Format output as HTML (``sources`` is not read).

	        The answer and every citation field are HTML-escaped before being
	        interpolated, so markup characters in model output or document
	        titles cannot break or inject into the generated fragment.
	        """
	        parts = [
	            "<div class='rag-response'>\n",
	            f" <p class='answer'>{self._escape_html(answer)}</p>\n",
	        ]

	        if self.include_citations and citations:
	            parts.append(" <div class='citations'>\n")
	            parts.append(" <h3>Sources:</h3>\n")
	            parts.append(" <ul>\n")
	            for citation in citations:
	                cid = self._escape_html(citation["id"])
	                title = self._escape_html(citation["title"])
	                source = self._escape_html(citation["source"])
	                parts.append(f" <li data-id='{cid}' data-source='{source}'>\n")
	                parts.append(f" <strong>{citation['index']}. {title}</strong>\n")
	                parts.append(f" <br/><small>{source}</small>\n")
	                parts.append(" </li>\n")
	            parts.append(" </ul>\n")
	            parts.append(" </div>\n")

	        if self.include_confidence:
	            parts.append(f" <div class='confidence'>Confidence: {confidence:.1%}</div>\n")

	        parts.append("</div>")
	        return "".join(parts)

	    def _format_xml(
	        self, answer: str, citations: List[Dict], confidence: float, sources: List[Dict]
	    ) -> str:
	        """Format output as XML (``sources`` is not read).

	        Text nodes and attribute values are XML-escaped so that characters
	        such as ``&``, ``<`` or quotes in the data keep the document
	        well-formed.
	        """
	        parts = [
	            '<?xml version="1.0" encoding="UTF-8"?>\n',
	            "<response>\n",
	            f" <answer>{self._escape_xml(answer)}</answer>\n",
	        ]

	        if self.include_citations and citations:
	            parts.append(" <citations>\n")
	            for citation in citations:
	                cid = self._escape_xml(citation["id"], attribute=True)
	                source = self._escape_xml(citation["source"], attribute=True)
	                title = self._escape_xml(citation["title"])
	                document_id = self._escape_xml(citation["document_id"])
	                parts.append(f" <citation id='{cid}' source='{source}'>\n")
	                parts.append(f" <title>{title}</title>\n")
	                parts.append(f" <document_id>{document_id}</document_id>\n")
	                parts.append(" </citation>\n")
	            parts.append(" </citations>\n")

	        if self.include_confidence:
	            parts.append(f" <confidence>{confidence:.4f}</confidence>\n")

	        parts.append("</response>")
	        return "".join(parts)

	    def _format_plain_text(self, answer: str, citations: List[Dict], sources: List[Dict]) -> str:
	        """Format output as plain text (``sources`` is not read)."""
	        lines = [answer]

	        if self.include_citations and citations:
	            lines.append("\n\nSources:")
	            for citation in citations:
	                lines.append(f"\n[{citation['index']}] {citation['title']} ({citation['source']})")

	        return "\n".join(lines)


	class StructuredOutputGenerator:
	    """Generate structured outputs for different use cases.

	    All builders are static methods that return plain dictionaries.
	    """

	    # Maximum number of characters kept from each context string in a summary.
	    _SUMMARY_LENGTH = 100

	    @staticmethod
	    def generate_qa_pair(
	        question: str,
	        answer: str,
	        context: Optional[str] = None,
	        confidence: Optional[float] = None,
	    ) -> Dict[str, Any]:
	        """Generate Q&A pair structure.

	        ``context`` is included only when it is non-empty; ``confidence`` is
	        included whenever it is not ``None`` (so ``0.0`` is kept).
	        """
	        qa_pair = {"question": question, "answer": answer}

	        if context:
	            qa_pair["context"] = context

	        if confidence is not None:
	            qa_pair["confidence"] = confidence

	        return qa_pair

	    @staticmethod
	    def generate_search_result(
	        query: str, results: List[Dict], total_results: int, search_time_ms: float
	    ) -> Dict[str, Any]:
	        """Generate search result structure.

	        ``results_count`` is the length of ``results``; ``total_results`` is
	        passed through as given.
	        """
	        return {
	            "query": query,
	            "results": results,
	            "total_results": total_results,
	            "search_time_ms": search_time_ms,
	            "results_count": len(results),
	        }

	    @staticmethod
	    def generate_conversation_turn(
	        user_message: str,
	        assistant_message: str,
	        turn_id: str,
	        context_used: List[str],
	        confidence: float,
	        metadata: Optional[Dict] = None,
	    ) -> Dict[str, Any]:
	        """Generate conversation turn structure.

	        Each context string is summarised to its first 100 characters; the
	        ``"..."`` marker is appended only to strings that were actually
	        truncated. A missing ``metadata`` becomes an empty dict.
	        """
	        limit = StructuredOutputGenerator._SUMMARY_LENGTH
	        summaries = [
	            text if len(text) <= limit else text[:limit] + "..."
	            for text in context_used
	        ]

	        return {
	            "turn_id": turn_id,
	            "user": user_message,
	            "assistant": assistant_message,
	            "context_used": {
	                "count": len(context_used),
	                "summaries": summaries,
	            },
	            "confidence": confidence,
	            "metadata": metadata or {},
	        }

	    @staticmethod
	    def generate_document_insight(
	        document_id: str, summary: str, key_points: List[str], topics: List[str], confidence: float
	    ) -> Dict[str, Any]:
	        """Generate document insight structure from the given fields."""
	        return {
	            "document_id": document_id,
	            "summary": summary,
	            "key_points": key_points,
	            "topics": topics,
	            "confidence": confidence,
	        }