Spaces:

evaluatorhub42
/

Prototyp_Chatbot_Kontextanalyse_2

Sleeping

Prototyp_Chatbot_Kontextanalyse_2 / conversation /citation_utils.py

Julia Ostheimer

Fix page number and show in German

a685766 9 months ago

2.98 kB

	from pydantic import BaseModel, Field
	from typing import List, Union
	import os

	class Chunk(BaseModel):
	    # One citation entry: identifies a single source (by number, id, file
	    # name and page) that justifies part of an answer.
	    #
	    # Documented with comments instead of a class docstring on purpose:
	    # pydantic copies a model's docstring into the generated JSON schema
	    # description, so adding one would change the schema this model emits.
	    #
	    # NOTE(review): the description below says "1-indexed", while
	    # format_artifacts_to_string numbers sources starting at 0 - confirm
	    # which convention is intended.
	    source_number: int = Field(
	        ...,
	        description="Index (1-indexed) of the SPECIFIC source which justifies the answer",
	    )
	    # Identifier of the cited source (its "_id").
	    source_id: str = Field(
	        ...,
	        description="The _id of the SPECIFIC source which justifies the answer",
	    )
	    # Name of the file the cited source comes from.
	    file: str = Field(
	        ...,
	        description="The file_name of the source file",
	    )
	    # Either a numeric page or a textual page label.
	    page: Union[int, str] = Field(
	        ...,
	        description="Page number or label",
	    )

	class CitedAnswer(BaseModel):
	    # Structured answer: the answer text plus the list of Chunk citations
	    # that justify it.
	    #
	    # Documented with comments instead of a class docstring on purpose:
	    # pydantic copies a model's docstring into the generated JSON schema
	    # description, so adding one would change the schema this model emits.

	    # Answer text; inline citation markers are expected inside the text itself.
	    answer: str = Field(
	        ...,
	        description="Answer text to the user question with inline citation markers like <sup>[1]</sup> injected at the end of the sentence or paragraph they refer to.",
	    )
	    # Citations backing the answer; an empty list when no source was used.
	    sources: List[Chunk] = Field(
	        ...,
	        description="Citations from the given sources that justify the answer. If no sources are utilised return an empty list.",
	    )


	def format_artifacts_to_string(artifacts: List[dict]) -> str:
	    """
	    Render retrieved artifacts as one text block for use in an LLM prompt.

	    Each artifact is read through attribute access: ``.metadata`` (a mapping
	    queried with ``.get``) and ``.page_content`` (a string). Note that this
	    does not match the ``List[dict]`` annotation, which is kept unchanged
	    for interface compatibility.

	    Every artifact becomes one entry of the form:

	        Source Number: <position in the list, starting at 0>
	        Source ID: <metadata "_id", or "No ID">
	        Filename: <basename of metadata "source", or empty>
	        Page number: <metadata "page", or "No page number">
	        Chunk: <page_content, stripped, newlines replaced by spaces>

	    Entries are separated by one blank line. An empty input yields "".

	    NOTE(review): numbering here starts at 0, while Chunk.source_number is
	    described as 1-indexed - confirm which convention is intended.
	    """

	    def _render_entry(position, artifact):
	        # Build the five labelled lines for a single artifact.
	        metadata = artifact.metadata
	        source_id = metadata.get("_id", "No ID")
	        # Only the file name is shown, not the full source path.
	        filename = os.path.basename(metadata.get("source", ""))
	        page = metadata.get("page", "No page number")
	        # Flatten the chunk to a single line so each entry keeps its shape.
	        flattened = artifact.page_content.strip().replace("\n", " ")
	        entry_lines = (
	            f"Source Number: {position}",
	            f"Source ID: {source_id}",
	            f"Filename: {filename}",
	            f"Page number: {page}",
	            f"Chunk: {flattened}",
	        )
	        return "\n".join(entry_lines)

	    return "\n\n".join(
	        _render_entry(position, artifact)
	        for position, artifact in enumerate(artifacts)
	    )

	def fix_page_number(incorrect_page_number: Union[int, str]) -> str:
	    """
	    Return the page value to display, shifted up by one when it is numeric.

	    The shift by one is presumably needed because the stored page index is
	    0-based while readers expect 1-based page numbers - confirm against the
	    document loader.

	    Args:
	        incorrect_page_number: Page value as an int, a numeric string, the
	            placeholder "No page number", or a non-numeric page label.

	    Returns:
	        - the placeholder "No page number" unchanged,
	        - ``str(value + 1)`` for ints and numeric strings,
	        - the label unchanged (as a string) when it is not numeric,
	          e.g. "iv" or "A-3".
	    """
	    if incorrect_page_number == "No page number":
	        return incorrect_page_number

	    try:
	        page_index = int(incorrect_page_number)
	    except ValueError:
	        # Pages may be labels rather than numbers (Chunk.page is
	        # Union[int, str]); a label cannot be shifted, so show it as is
	        # instead of crashing the whole reference list.
	        return str(incorrect_page_number)

	    return str(page_index + 1)

	def embed_references(result: "CitedAnswer") -> str:
	    """
	    Append a "Referenzen:" list to the answer text of a cited result.

	    Args:
	        result: Object read through attribute access (not dict keys),
	            e.g. a CitedAnswer. It must expose:
	            - ``answer``: the answer text,
	            - ``sources``: iterable of objects with ``source_number``,
	              ``file`` and ``page`` attributes.

	    Returns:
	        The answer alone when there are no sources. Otherwise the answer
	        followed by the heading "Referenzen:" and one line per source of
	        the form "[<source_number>] <file>, Seite <page>", ordered by
	        source_number and separated by blank lines. The page is passed
	        through fix_page_number before display.

	    This function does not inject inline citation markers; it only appends
	    the reference list to the answer text it is given.
	    """
	    # Sort by source_number to ensure consistent ordering.
	    ordered_sources = sorted(result.sources, key=lambda s: s.source_number)

	    # Without citations there is nothing to list; avoid emitting a dangling
	    # "Referenzen:" heading below the answer.
	    if not ordered_sources:
	        return result.answer

	    # The leading empty string is kept so the spacing between the answer and
	    # the heading stays exactly as before.
	    refs = ["", "Referenzen:"]
	    refs.extend(
	        f"[{src.source_number}] {src.file}, Seite {fix_page_number(src.page)}"
	        for src in ordered_sources
	    )

	    return result.answer + "\n\n" + "\n\n".join(refs)