| from pydantic import BaseModel, Field |
| from typing import List, Union |
| import os |
|
|
class Chunk(BaseModel):
    # One citation entry: the fields that identify a single source backing
    # an answer.
    # Documented with '#' comments on purpose: pydantic copies a class
    # docstring into the generated schema description, which would change
    # what schema consumers see.
    # NOTE(review): source_number is described as 1-indexed, whereas
    # format_artifacts_to_string in this file numbers sources from 0 by
    # default -- confirm which convention is intended.

    # Number of the cited source.
    source_number: int = Field(
        default=...,
        description="Index (1-indexed) of the SPECIFIC source which justifies the answer",
    )
    # Identifier of the cited source (its "_id").
    source_id: str = Field(
        default=...,
        description="The _id of the SPECIFIC source which justifies the answer",
    )
    # File name of the document the source comes from.
    file: str = Field(
        default=...,
        description="The file_name of the source file",
    )
    # Either a numeric page or a free-form label, hence the union type.
    page: Union[int, str] = Field(
        default=...,
        description="Page number or label",
    )
|
|
class CitedAnswer(BaseModel):
    # Structured answer: the answer text plus the citations that back it.
    # Documented with '#' comments on purpose: pydantic copies a class
    # docstring into the generated schema description, which would change
    # what schema consumers see.
    # NOTE(review): the field descriptions read like instructions to a model
    # producing this structure -- presumably the schema is used for LLM
    # structured output; confirm against the caller.

    # Answer text, expected to already contain inline <sup>[n]</sup> markers.
    answer: str = Field(
        default=...,
        description="Answer text to the user question with inline citation markers like <sup>[1]</sup> injected at the end of the sentence or paragraph they refer to.",
    )
    # Citations backing the answer; an empty list is an allowed value.
    sources: List[Chunk] = Field(
        default=...,
        description="Citations from the given sources that justify the answer. If no sources are utilised return an empty list.",
    )
| |
| |
def format_artifacts_to_string(artifacts: List, *, start_index: int = 0) -> str:
    """Render retrieved artifacts as one text block for an LLM prompt.

    Each artifact becomes five labelled lines::

        Source Number: <n>
        Source ID: <id>
        Filename: <filename>
        Page number: <page>
        Chunk: <text>

    and consecutive entries are separated by one blank line.

    Args:
        artifacts: Objects exposing ``.metadata`` (a mapping read via
            ``.get``) and ``.page_content`` (a string). They are accessed by
            attribute, so plain dicts do not work here.
        start_index: Number assigned to the first source. Defaults to 0,
            which keeps the historical numbering; pass 1 to line the prompt
            up with the "1-indexed" wording of ``Chunk.source_number``.

    Returns:
        The formatted string, or an empty string when ``artifacts`` is empty.
    """
    entries = []
    for number, artifact in enumerate(artifacts, start=start_index):
        metadata = artifact.metadata
        # `or ""` also covers a "source" key that is present but None, which
        # os.path.basename would otherwise reject with a TypeError.
        file_name = os.path.basename(metadata.get("source") or "")
        chunk_id = metadata.get("_id", "No ID")
        page_number = metadata.get("page", "No page number")
        # Flatten the chunk onto one line so every entry keeps the same
        # five-line layout.
        text = artifact.page_content.strip().replace("\n", " ")
        entries.append(
            f"Source Number: {number}\n"
            f"Source ID: {chunk_id}\n"
            f"Filename: {file_name}\n"
            f"Page number: {page_number}\n"
            f"Chunk: {text}"
        )
    return "\n\n".join(entries)
|
|
def fix_page_number(incorrect_page_number: Union[int, str]) -> str:
    """Return the page reference shifted by one, as a string.

    Numeric input (an int or a numeric string) is incremented by one --
    presumably because the stored page metadata is 0-based while displayed
    pages are 1-based; confirm against the loader that fills the metadata.

    Anything that is not a number is returned unchanged. This covers the
    "No page number" placeholder as well as page labels such as "iv", which
    previously raised ValueError.

    Args:
        incorrect_page_number: Page number or label as stored on the source.

    Returns:
        The incremented page number, or the original label, as a string.
    """
    try:
        return str(int(incorrect_page_number) + 1)
    except ValueError:
        # Not a number: a placeholder or a textual page label; keep as-is.
        return str(incorrect_page_number)
|
|
def embed_references(result: "CitedAnswer") -> str:
    """Append a Markdown reference list to an answer.

    The answer text itself is not modified; this function does not insert
    citation markers, so any inline ``<sup>[n]</sup>`` markers must already
    be part of ``result.answer``.

    Args:
        result: Object exposing ``.answer`` (str) and ``.sources``, where
            each source has ``.source_number``, ``.file`` and ``.page``
            (e.g. a ``CitedAnswer``). Accessed by attribute, not as a dict.

    Returns:
        The answer followed by a "**Referenzen:**" heading and one
        "[n] <file>, Seite <page>" entry per source, ordered by
        ``source_number`` and with pages passed through
        ``fix_page_number``. When there are no sources the answer is
        returned as-is, without an empty heading.
    """
    answer = result.answer
    if not result.sources:
        # An empty source list is a valid outcome; do not emit a dangling
        # references heading for it.
        return answer

    ordered = sorted(result.sources, key=lambda s: s.source_number)

    # The leading empty string yields the extra blank gap between the answer
    # and the heading once everything is joined with "\n\n".
    lines = ["", "**Referenzen:**"]
    lines.extend(
        f"[{src.source_number}] {src.file}, Seite {fix_page_number(src.page)}"
        for src in ordered
    )
    return answer + "\n\n" + "\n\n".join(lines)