Julia Ostheimer
Fix page number and show in German
a685766
from pydantic import BaseModel, Field
from typing import List, Union
import os
class Chunk(BaseModel):
    # A single citation entry: which source was used, and where it lives
    # (file name and page).
    #
    # Documented with `#` comments on purpose: pydantic copies a class
    # docstring into the generated schema description, so adding one would
    # change the schema text rather than just the documentation.
    #
    # NOTE(review): the description below says "1-indexed", while
    # format_artifacts_to_string in this file numbers sources starting at 0.
    # Confirm which numbering is intended.
    source_number: int = Field(
        ...,
        description="Index (1-indexed) of the SPECIFIC source which justifies the answer",
    )
    # Identifier of the cited source (the `_id` value of that source).
    source_id: str = Field(
        ...,
        description="The _id of the SPECIFIC source which justifies the answer",
    )
    # File name of the document the source came from.
    file: str = Field(
        ...,
        description="The file_name of the source file",
    )
    # Either a numeric page or a free-form page label.
    page: Union[int, str] = Field(
        ...,
        description="Page number or label",
    )
class CitedAnswer(BaseModel):
    # Structured answer: the answer text plus the list of citations backing it.
    #
    # Documented with `#` comments on purpose: pydantic copies a class
    # docstring into the generated schema description, so adding one would
    # change the schema text rather than just the documentation.

    # Answer text; the field description asks for inline <sup>[n]</sup> markers.
    answer: str = Field(
        ...,
        description="Answer text to the user question with inline citation markers like <sup>[1]</sup> injected at the end of the sentence or paragraph they refer to.",
    )
    # Citations backing the answer; an empty list is a valid value.
    sources: List[Chunk] = Field(
        ...,
        description="Citations from the given sources that justify the answer. If no sources are utilised return an empty list.",
    )
def format_artifacts_to_string(artifacts: List[dict]) -> str:
    """Render retrieved artifacts as one plain-text block for an LLM prompt.

    Each artifact is read through its ``metadata`` mapping and its
    ``page_content`` string (attribute access, not dict keys) and becomes a
    five-line entry::

        Source Number: <position in ``artifacts``, counted from 0>
        Source ID: <metadata "_id", or "No ID" when absent>
        Filename: <base name of metadata "source", empty when absent>
        Page number: <metadata "page", or "No page number" when absent>
        Chunk: <page_content, stripped, newlines replaced by spaces>

    Entries are separated by one blank line. An empty input yields an empty
    string.
    """

    def _entry(position, artifact):
        # Build the five-line description of a single artifact.
        info = artifact.metadata
        source_id = info.get("_id", "No ID")
        base_name = os.path.basename(info.get("source", ""))
        page = info.get("page", "No page number")
        # Keep every chunk on one line so entries stay visually separated.
        flattened = artifact.page_content.strip().replace("\n", " ")
        return "\n".join(
            (
                f"Source Number: {position}",
                f"Source ID: {source_id}",
                f"Filename: {base_name}",
                f"Page number: {page}",
                f"Chunk: {flattened}",
            )
        )

    return "\n\n".join(
        _entry(position, artifact) for position, artifact in enumerate(artifacts)
    )
def fix_page_number(incorrect_page_number: Union[int, str]) -> str:
    """Return the page number increased by one, as a string.

    Presumably the stored page metadata is 0-based and this shifts it to the
    1-based number a reader expects (TODO confirm against the loader that
    produces the metadata).

    Args:
        incorrect_page_number: A page number as ``int`` or numeric string,
            the placeholder ``"No page number"``, or a non-numeric page
            label.

    Returns:
        The incremented page number as a string. The placeholder
        ``"No page number"`` and any value that cannot be parsed as an
        integer (for example a label such as ``"iv"``) are returned
        unchanged, as a string.
    """
    if incorrect_page_number == "No page number":
        return incorrect_page_number
    try:
        return str(int(incorrect_page_number) + 1)
    except ValueError:
        # A page label cannot be shifted; show it as-is instead of letting
        # the whole reference list fail to render.
        return str(incorrect_page_number)
def embed_references(result: dict) -> str:
    """Append a German-language reference list to an LLM answer.

    Despite the ``dict`` annotation, ``result`` is read through attribute
    access: it must expose ``answer`` (str) and ``sources`` (an iterable of
    objects with ``source_number``, ``file`` and ``page`` attributes).

    The answer text is not modified and no citation markers are inserted
    here; only the reference list is appended.

    Args:
        result: The structured LLM result described above.

    Returns:
        The answer followed by a ``**Referenzen:**`` section with one
        ``[n] <file>, Seite <page>`` line per source, ordered by
        ``source_number``. When there are no sources the answer is returned
        unchanged, so no empty reference heading is shown.
    """
    if not result.sources:
        return result.answer

    # Sort by source_number so the list order is stable across calls.
    ordered = sorted(result.sources, key=lambda src: src.source_number)

    # The leading empty string adds extra vertical space before the heading.
    lines = ["", "**Referenzen:**"]
    lines.extend(
        f"[{src.source_number}] {src.file}, Seite {fix_page_number(src.page)}"
        for src in ordered
    )
    return result.answer + "\n\n" + "\n\n".join(lines)