|
|
|
|
|
|
|
|
|
|
|
import os

import gradio as gr

# Set before the chromadb imports below so the variable is already present in
# the environment when those modules load. Keep this ordering.
# NOTE(review): presumably this silences Chroma telemetry — confirm the
# variable name against the installed chromadb version.
os.environ["CHROMA_TELEMETRY_DISABLED"] = "1"

from chromadb.config import Settings  # noqa: E402
from langchain_chroma import Chroma  # noqa: E402
from langchain_community.embeddings import HuggingFaceEmbeddings  # noqa: E402
|
|
|
|
|
|
|
|
# Runtime configuration; each value can be overridden via an environment variable.

# Directory that holds the persisted Chroma index.
PERSIST_DIR = os.getenv("PERSIST_DIR", "./chroma_langchain")

# Hugging Face model id handed to the embedding wrapper.
EMB_MODEL = os.getenv("EMB_MODEL", "nomic-ai/nomic-embed-text-v1.5")

# Default number of hits. int() raises ValueError at import time when TOPK is
# set to something that is not an integer string.
TOPK_DEF = int(os.getenv("TOPK", "5"))
|
|
|
|
|
|
|
|
# Shared embedding function, built once at import time and passed to the
# vector store for query-time encoding.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run while loading; EMB_MODEL comes from the environment, so
# only point it at repositories you trust.
# NOTE(review): the nomic-embed model card describes task prefixes such as
# "search_query: "; none is added here — confirm this matches how the index
# was built.
EMBEDDINGS = HuggingFaceEmbeddings(
    model_name=EMB_MODEL,
    model_kwargs={"trust_remote_code": True},
    encode_kwargs={"normalize_embeddings": True},
)
|
|
|
|
|
def load_vector_store():
    """Open the persisted Chroma collection for query-time use.

    The store is created with ``EMBEDDINGS`` as its embedding function. A
    ``count()`` call is issued right away so that a broken or missing index
    fails here instead of on the first search.

    Returns:
        A ``(vector_store, error)`` tuple: ``(store, None)`` on success, or
        ``(None, message)`` on failure. The message contains ``PERSIST_DIR``,
        the collection names found there (best effort; empty when they
        cannot be listed) and the text of the original exception.
    """
    try:
        vs = Chroma(
            persist_directory=PERSIST_DIR,
            embedding_function=EMBEDDINGS,
            client_settings=Settings(anonymized_telemetry=False),
        )
        # Touch the collection so load problems surface inside this try block.
        vs._collection.count()
        return vs, None
    except Exception as e:
        # Best-effort diagnostics only: a failure while listing collections
        # must not mask the original error, so it degrades to an empty list.
        try:
            import chromadb

            client = chromadb.PersistentClient(
                path=PERSIST_DIR, settings=Settings(anonymized_telemetry=False)
            )
            # Some chromadb releases return Collection objects here, others
            # plain name strings; accept both instead of assuming `.name`.
            existing = [getattr(c, "name", c) for c in client.list_collections()]
        except Exception:
            existing = []

        msg = (
            f"Failed to load Chroma store at '{PERSIST_DIR}'. "
            f"Existing collections: {existing or '—'}. "
            "Check that the index folder is present in the Space and the collection name matches."
        )
        return None, f"{msg}\n\nDetails: {e}"
|
|
|
|
|
# Load the store once at import time. search() reads both names: LOAD_ERR is
# None on success, otherwise VS is None and LOAD_ERR holds the message to show.
VS, LOAD_ERR = load_vector_store()
|
|
|
|
|
def search(query: str, k: int = TOPK_DEF) -> str:
    """Run a semantic search and render the hits as Markdown.

    Args:
        query: Free-text query. ``None`` is treated as empty and surrounding
            whitespace is stripped.
        k: Number of hits to request; coerced with ``int()`` before use.

    Returns:
        A Markdown string. On success it holds a heading, one entry per hit
        (rank, score and a snippet of at most 800 characters) and a
        disclaimer. Otherwise it is a short message: the load error, a prompt
        for an empty query, the search exception text, or "No results.".
        Search errors are reported in the returned text, not raised.
    """
    if LOAD_ERR:
        return f"⚠️ {LOAD_ERR}"

    q = (query or "").strip()
    if not q:
        return "Please enter a query."

    try:
        results = VS.similarity_search_with_score(q, k=int(k))
    except Exception as e:
        # UI boundary: show the failure to the user instead of raising.
        return f"Search failed: {e}"

    if not results:
        return "No results."

    lines = [f"### Top {len(results)} results"]
    for i, (doc, score) in enumerate(results, 1):
        # NOTE(review): doc.metadata (e.g. "source" / "file_path") was looked
        # up here before but never rendered; the dead lookup was removed.
        # Confirm whether the source should be shown next to each hit.
        text = doc.page_content
        snippet = (text[:800] + "…") if len(text) > 800 else text
        # Prefix every line so multi-paragraph snippets stay inside the
        # Markdown blockquote (a blank line would otherwise end the quote).
        quoted = "\n> ".join(snippet.splitlines())
        # The score returned by similarity_search_with_score is a distance
        # (lower means more similar), so it is labelled as such.
        lines.append(
            f"**[{i}]** \nDistance (lower is closer): `{score:.4f}`\n\n> {quoted}"
        )

    lines.append("\n> **Disclaimer:** Models can produce incorrect or misleading statements. Verify with sources.")
    return "\n\n".join(lines)
|
|
|
|
|
# Gradio UI: an intro panel, a query box with a Top-K slider, and a Markdown
# area that shows whatever search() returns when the button is clicked.
with gr.Blocks(title="Semantische Suchmaschine für BGH Leitsatzentscheidungen v0.1") as demo:
    # The intro text is kept at column 0 so the Markdown never depends on
    # indentation handling.
    gr.Markdown(
        """
## Semantische Suchmaschine für BGH Leitsatzentscheidungen v0.1
**Datensatz: 21.603 Leitsatzentscheidungen des BGH (ab dem Jahr 2000) extrahiert aus https://zenodo.org/records/15153244**

**Modell:** nomic-ai/nomic-embed-text-v1.5

**Wie es funktioniert:** Ermöglicht die semantische Suche im Datensatz und gibt die Entscheidungen geordnet nach Ähnlichkeitswerten zurück.

**Versuche beispielsweise:**
- `Kann KI Erfinder sein?` → erwartetes Aktenzeichen **X ZB 5/22**

*Disclaimer:* Models may produce incorrect or misleading statements. Verify with sources.
"""
    )

    # Query text and result count share one row.
    with gr.Row():
        q = gr.Textbox(label="Query", placeholder="Kann KI Erfinder sein?")
        k = gr.Slider(1, 20, value=TOPK_DEF, step=1, label="Top-K")

    out = gr.Markdown()
    gr.Button("Search").click(fn=search, inputs=[q, k], outputs=[out])
|
|
|
|
|
# Start the web server only when this file is executed as a script, so that
# importing the module (for example to call search() directly) does not
# start a server.
if __name__ == "__main__":
    demo.launch()
|
|
|