neuralworm committed
Commit 1e88950 · 1 Parent(s): 3aa1632
Files changed (3)
  1. app.py +85 -52
  2. test/wikitop10.txt +0 -0
  3. test/wikitop100.txt +0 -0
app.py CHANGED
@@ -1,17 +1,17 @@
- # app.py - v2.0 (Production)
- # Description: Final, stable, production-ready version. Fixes the last streaming problem
- # by using the correct `TextIteratorStreamer`, which was designed for
- # programmatic iteration in UIs such as Gradio.

  import os
  import torch
  import gradio as gr

  from typing import List, Tuple, Generator, Dict
  from threading import Thread

  # ML / Transformers
- # HERE IS THE FIX: TextIteratorStreamer instead of TextStreamer
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer

  # Document processing & RAG
@@ -32,6 +32,10 @@ LLM_MODEL: Gemma3ForConditionalGeneration = None
  LLM_PROCESSOR: AutoProcessor = None
  VECTOR_STORE: FAISS = None

  # --------------------------------------------------------------------
  # Model Loading
  # --------------------------------------------------------------------
@@ -42,16 +46,20 @@ def get_device() -> torch.device:
  def get_embedding_function() -> HuggingFaceEmbeddings:
      global EMBEDDING_FUNCTION
      if EMBEDDING_FUNCTION is None:
          EMBEDDING_FUNCTION = HuggingFaceEmbeddings(
              model_name=EMBED_MODEL_ID,
-             model_kwargs={'device': get_device()}
          )
      return EMBEDDING_FUNCTION

  def get_llm() -> Tuple[Gemma3ForConditionalGeneration, AutoProcessor]:
      global LLM_MODEL, LLM_PROCESSOR
      if LLM_MODEL is None or LLM_PROCESSOR is None:
          device = get_device()
          dtype = torch.bfloat16 if "cuda" in device.type else torch.float32
          LLM_MODEL = Gemma3ForConditionalGeneration.from_pretrained(
              LLM_MODEL_ID,
@@ -59,6 +67,7 @@ def get_llm() -> Tuple[Gemma3ForConditionalGeneration, AutoProcessor]:
              device_map="auto",
          ).eval()
          LLM_PROCESSOR = AutoProcessor.from_pretrained(LLM_MODEL_ID)
      return LLM_MODEL, LLM_PROCESSOR

  # --------------------------------------------------------------------
@@ -90,12 +99,13 @@ def get_text_splitter() -> RecursiveCharacterTextSplitter:
  # --------------------------------------------------------------------
  def index_files(file_paths: List[str], progress=gr.Progress(track_tqdm=True)) -> str:
      global VECTOR_STORE
-     if not file_paths: return "No files selected for indexing."

      embedding_function = get_embedding_function()
      text_splitter = get_text_splitter()
      documents: List[Document] = []
-     for path in progress.tqdm(file_paths, desc="1/2: Processing & chunking files"):
          if path is None: continue
          text = extract_text_from_file(path)
          if not text.strip(): continue
@@ -105,133 +115,156 @@ def index_files(file_paths: List[str], progress=gr.Progress(track_tqdm=True)) ->
              doc = Document(page_content=chunk, metadata={"source": source_name})
              documents.append(doc)

-     if not documents: return "No text could be extracted from the files."

-     progress(0.5, desc="2/2: Creating embeddings & building FAISS index...")
      new_store = FAISS.from_documents(documents, embedding_function)

      if VECTOR_STORE is None: VECTOR_STORE = new_store
      else: VECTOR_STORE.add_documents(documents)

      final_count = VECTOR_STORE.index.ntotal
-     return f"Index updated: {final_count} chunks in total."

  def clear_index() -> str:
      global VECTOR_STORE
      VECTOR_STORE = None
      import gc; gc.collect()
-     return "Index cleared."

  def retrieve_relevant_chunks(query: str, top_k: int = 5) -> List[Dict]:
-     if VECTOR_STORE is None: return []
      results_with_scores = VECTOR_STORE.similarity_search_with_score(query, k=top_k)
-     return [{
          "content": doc.page_content,
-         "source": doc.metadata.get("source", "Unknown"),
          "score": 1 - score
      } for doc, score in results_with_scores]

  # --------------------------------------------------------------------
  # LLM generation with streaming
  # --------------------------------------------------------------------
  def build_rag_prompt(user_question: str, retrieved_chunks: List[Dict]) -> str:
-     # ... (this function is unchanged)
      if not retrieved_chunks:
-         context_str = "No relevant context documents were found."
      else:
          context_parts = []
          for i, ch in enumerate(retrieved_chunks, start=1):
-             context_parts.append(
-                 f"Document [{i}] (Source: {ch['source']}, Relevance: {ch['score']:.3f}):\n\"{ch['content']}\""
-             )
          context_str = "\n\n".join(context_parts)
-     prompt = (f"You are a precise, helpful assistant. Your task is to answer the following user question based "
-               f"exclusively on the context documents provided below. "
-               f"If the answer is not contained within the documents, state clearly: 'The information is not available in the provided documents.' "
-               f"Answer in German, summarizing the relevant information instead of quoting verbatim.\n\n"
-               f"--- Context Documents ---\n{context_str}\n\n"
-               f"--- User Question ---\n{user_question}\n\n"
-               f"--- Your Answer ---\n")
      return prompt

  def answer_with_rag(question: str, history: list) -> Generator[str, None, None]:
      model, processor = get_llm()
-     # HERE IS THE FIX: use TextIteratorStreamer
      streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

      retrieved = retrieve_relevant_chunks(question, top_k=5)
      prompt = build_rag_prompt(question, retrieved)
      messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]

      input_ids = processor.apply_chat_template(
          messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
      ).to(model.device)

      generation_kwargs = {
-         "input_ids": input_ids,
-         "streamer": streamer,
-         "max_new_tokens": 1024,
-         "do_sample": True,
-         "temperature": 0.7,
-         "top_p": 0.9,
      }

-     # Generation has to run in a separate thread so that we can iterate
-     # over the streamer in the main thread.
      thread = Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()

-     # Now we can iterate over the streamer and pass the tokens on to the UI.
      for new_text in streamer:
          yield new_text

  # --------------------------------------------------------------------
  # Gradio UI
  # --------------------------------------------------------------------
  def build_demo() -> gr.Blocks:
-     with gr.Blocks(title="Gemma 3 RAG v2.0", theme="soft") as demo:
          gr.Markdown(
              """
-             # 🔍 Gemma 3 RAG v2.0 - Production Ready
-             **A State-of-the-Art RAG pipeline with `google/embeddinggemma-300m` and `google/gemma-3-4b-it`**
-             1. Upload your documents and click "Update Index".
-             2. Ask your questions in the chat window. The answers will be streamed live.
              """
          )
          with gr.Row():
              with gr.Column(scale=1):
-                 gr.Markdown("### 📁 Document Management")
-                 file_uploader = gr.File(label="Upload Files (.pdf, .txt, .md)", file_count="multiple", type="filepath")
                  with gr.Row():
-                     index_button = gr.Button("🔄 Update Index", variant="primary")
-                     clear_index_button = gr.Button("🧹 Clear Index")
-                 index_status = gr.Markdown("Index is empty.")
                  index_button.click(fn=index_files, inputs=file_uploader, outputs=index_status)
                  clear_index_button.click(fn=clear_index, inputs=None, outputs=index_status)
              with gr.Column(scale=2):
-                 gr.Markdown("### 💬 Chat About Your Documents")
                  chatbot = gr.Chatbot(label="Gemma-3 Chat", type="messages", show_copy_button=True, height=600, render_markdown=True)
                  with gr.Row():
-                     msg_textbox = gr.Textbox(label="Your Question", placeholder="Ask something about the uploaded documents...", scale=4, autofocus=True)
-                     send_btn = gr.Button("Send", variant="primary", scale=1)

          def chat_interface(message: str, history: list):
              if not message or not message.strip(): return history
              history.append({"role": "user", "content": message})
              history.append({"role": "assistant", "content": ""})
              for token in answer_with_rag(message, history):
                  history[-1]["content"] += token
                  yield history

          msg_textbox.submit(fn=chat_interface, inputs=[msg_textbox, chatbot], outputs=chatbot).then(fn=lambda: gr.update(value=""), outputs=msg_textbox)
          send_btn.click(fn=chat_interface, inputs=[msg_textbox, chatbot], outputs=chatbot).then(fn=lambda: gr.update(value=""), outputs=msg_textbox)
      return demo

  if __name__ == "__main__":
-     print("Starting application... Initializing models.")
      get_embedding_function()
      get_llm()

      app_demo = build_demo()
-     print("Models loaded. Launching Gradio interface.")
      app_demo.launch()
 
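For reference, the streaming idiom this commit settles on, separated from the app's specifics: `model.generate()` blocks until generation finishes, so it runs in a worker thread while the caller iterates the `TextIteratorStreamer`, which yields decoded text chunks as tokens arrive. A minimal, self-contained sketch (the model name below is a placeholder chosen for illustration, not this app's `LLM_MODEL_ID`):

    from threading import Thread
    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    tok = AutoTokenizer.from_pretrained("gpt2")           # placeholder model
    model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder model

    streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
    inputs = tok("The streaming pattern works like this:", return_tensors="pt")

    # generate() blocks, so it runs in a background thread; the main thread
    # consumes decoded text chunks from the streamer as they are produced.
    thread = Thread(target=model.generate,
                    kwargs={**inputs, "streamer": streamer, "max_new_tokens": 32})
    thread.start()
    for chunk in streamer:
        print(chunk, end="", flush=True)
    thread.join()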
+ # app.py - v2.1 (Debug Edition)
+ # Description: Combines the functional stability of v2.0 with the extensive
+ # debugging and assertions of earlier versions. This version is ideal
+ # for development, troubleshooting, and understanding the internal flow.

  import os
  import torch
  import gradio as gr
+ import time

  from typing import List, Tuple, Generator, Dict
  from threading import Thread

  # ML / Transformers
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer

  # Document processing & RAG

  LLM_PROCESSOR: AutoProcessor = None
  VECTOR_STORE: FAISS = None

+ def print_debug(message: str):
+     """Consistent debug output with a timestamp."""
+     print(f"[DEBUG {time.strftime('%H:%M:%S')}] {message}")
+
  # --------------------------------------------------------------------
  # Model Loading
  # --------------------------------------------------------------------

  def get_embedding_function() -> HuggingFaceEmbeddings:
      global EMBEDDING_FUNCTION
      if EMBEDDING_FUNCTION is None:
+         device = get_device()
+         print_debug(f"Initialisiere Embedding-Modell '{EMBED_MODEL_ID}' auf Device '{device}'.")
          EMBEDDING_FUNCTION = HuggingFaceEmbeddings(
              model_name=EMBED_MODEL_ID,
+             model_kwargs={'device': device}
          )
+         print_debug("Embedding-Modell erfolgreich initialisiert.")
      return EMBEDDING_FUNCTION

  def get_llm() -> Tuple[Gemma3ForConditionalGeneration, AutoProcessor]:
      global LLM_MODEL, LLM_PROCESSOR
      if LLM_MODEL is None or LLM_PROCESSOR is None:
          device = get_device()
+         print_debug(f"Initialisiere LLM '{LLM_MODEL_ID}' auf Device '{device}'.")
          dtype = torch.bfloat16 if "cuda" in device.type else torch.float32
          LLM_MODEL = Gemma3ForConditionalGeneration.from_pretrained(
              LLM_MODEL_ID,

              device_map="auto",
          ).eval()
          LLM_PROCESSOR = AutoProcessor.from_pretrained(LLM_MODEL_ID)
+         print_debug("LLM und Prozessor erfolgreich initialisiert.")
      return LLM_MODEL, LLM_PROCESSOR

  # --------------------------------------------------------------------

  # --------------------------------------------------------------------
  def index_files(file_paths: List[str], progress=gr.Progress(track_tqdm=True)) -> str:
      global VECTOR_STORE
+     if not file_paths: return "Keine Dateien zum Indexieren ausgewählt."
+     print_debug(f"Indexierung gestartet für {len(file_paths)} Datei(en).")

      embedding_function = get_embedding_function()
      text_splitter = get_text_splitter()
      documents: List[Document] = []
+     for path in progress.tqdm(file_paths, desc="1/2: Dateien verarbeiten & chunken"):
          if path is None: continue
          text = extract_text_from_file(path)
          if not text.strip(): continue

              doc = Document(page_content=chunk, metadata={"source": source_name})
              documents.append(doc)

+     assert all(isinstance(d, Document) for d in documents), "Alle Elemente in 'documents' müssen vom Typ langchain.Document sein."
+     print_debug(f"Erfolgreich {len(documents)} Chunks aus den Dateien erstellt.")
+     if not documents: return "Kein Text in den Dateien gefunden, der indexiert werden konnte."

+     progress(0.5, desc="2/2: Embeddings erstellen & FAISS Index aufbauen...")
      new_store = FAISS.from_documents(documents, embedding_function)
+     print_debug("FAISS Index erfolgreich aus Dokumenten erstellt.")

      if VECTOR_STORE is None: VECTOR_STORE = new_store
      else: VECTOR_STORE.add_documents(documents)

+     assert VECTOR_STORE is not None and VECTOR_STORE.index.ntotal > 0, "VECTOR_STORE wurde nicht korrekt initialisiert."
      final_count = VECTOR_STORE.index.ntotal
+     print_debug(f"Indexierung abgeschlossen. Gesamtanzahl der Chunks im Index: {final_count}")
+     return f"Index aktualisiert: {final_count} Chunks insgesamt."

  def clear_index() -> str:
      global VECTOR_STORE
      VECTOR_STORE = None
      import gc; gc.collect()
+     print_debug("Vektor-Index wurde geleert.")
+     return "Index geleert."

  def retrieve_relevant_chunks(query: str, top_k: int = 5) -> List[Dict]:
+     if VECTOR_STORE is None:
+         print_debug("Retrieval versucht, aber Vektor-Index ist leer.")
+         return []
+
+     print_debug(f"Suche nach {top_k} relevanten Chunks für die Anfrage: '{query}'")
      results_with_scores = VECTOR_STORE.similarity_search_with_score(query, k=top_k)
+
+     formatted_results = [{
          "content": doc.page_content,
+         "source": doc.metadata.get("source", "Unbekannt"),
          "score": 1 - score
      } for doc, score in results_with_scores]

+     assert isinstance(formatted_results, list), "Retrieval-Ergebnis muss eine Liste sein."
+     if formatted_results:
+         assert all("content" in r and "source" in r and "score" in r for r in formatted_results), "Jedes Retrieval-Ergebnis muss 'content', 'source' und 'score' enthalten."
+
+     print_debug(f"{len(formatted_results)} Chunks gefunden.")
+     return formatted_results
+
  # --------------------------------------------------------------------
  # LLM generation with streaming
  # --------------------------------------------------------------------
  def build_rag_prompt(user_question: str, retrieved_chunks: List[Dict]) -> str:
      if not retrieved_chunks:
+         context_str = "Es wurden keine relevanten Dokumente im Kontext gefunden."
      else:
          context_parts = []
          for i, ch in enumerate(retrieved_chunks, start=1):
+             context_parts.append(f"Dokument [{i}] (Quelle: {ch['source']}, Relevanz: {ch['score']:.3f}):\n\"{ch['content']}\"")
          context_str = "\n\n".join(context_parts)
+     prompt = (f"Du bist ein präziser, hilfreicher Assistent. Deine Aufgabe ist es, die folgende Benutzerfrage ausschließlich "
+               f"basierend auf den unten stehenden Kontext-Dokumenten zu beantworten. "
+               f"Wenn die Antwort nicht in den Dokumenten enthalten ist, gib klar an: 'Die Information ist in den bereitgestellten Dokumenten nicht enthalten.' "
+               f"Antworte auf Deutsch und fasse die relevanten Informationen zusammen, anstatt die Dokumente wörtlich zu zitieren.\n\n"
+               f"--- Kontext-Dokumente ---\n{context_str}\n\n"
+               f"--- Benutzerfrage ---\n{user_question}\n\n"
+               f"--- Deine Antwort ---\n")
      return prompt

  def answer_with_rag(question: str, history: list) -> Generator[str, None, None]:
+     print_debug("Starte RAG-Antwort-Generierung.")
      model, processor = get_llm()
      streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

      retrieved = retrieve_relevant_chunks(question, top_k=5)
      prompt = build_rag_prompt(question, retrieved)
+     print_debug(f"Generierter RAG-Prompt (erste 200 Zeichen): '{prompt[:200]}'")
+
      messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]

+     print_debug(f"Nachrichten-Struktur wird für Prozessor vorbereitet: {str(messages)}")
+     assert isinstance(messages, list) and len(messages) > 0, "Messages muss eine nicht-leere Liste sein."
+     assert isinstance(messages[0], dict) and "role" in messages[0] and "content" in messages[0], "Nachricht muss ein Dictionary mit 'role' und 'content' sein."
+     assert isinstance(messages[0]["content"], list) and len(messages[0]["content"]) > 0, "Content muss eine nicht-leere Liste sein."
+     assert isinstance(messages[0]["content"][0], dict) and "type" in messages[0]["content"][0] and "text" in messages[0]["content"][0], "Content-Block muss ein Dictionary mit 'type' und 'text' sein."
+     print_debug("ASSERTIONS für Nachrichten-Struktur erfolgreich bestanden.")
+
      input_ids = processor.apply_chat_template(
          messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
      ).to(model.device)

+     assert isinstance(input_ids, torch.Tensor), "Der Prozessor sollte einen torch.Tensor zurückgeben."
+     print_debug(f"Prozessor hat 'input_ids' mit der Form {input_ids.shape} erstellt.")
+
      generation_kwargs = {
+         "input_ids": input_ids, "streamer": streamer, "max_new_tokens": 1024,
+         "do_sample": True, "temperature": 0.7, "top_p": 0.9,
      }

      thread = Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()
+     print_debug("LLM-Generierungs-Thread gestartet.")

      for new_text in streamer:
          yield new_text
+     print_debug("LLM-Generierung abgeschlossen.")

  # --------------------------------------------------------------------
  # Gradio UI
  # --------------------------------------------------------------------
  def build_demo() -> gr.Blocks:
+     with gr.Blocks(title="Gemma 3 RAG v2.1", theme="soft") as demo:
          gr.Markdown(
              """
+             # 🔍 Gemma 3 RAG v2.1 - Debug Edition
+             **Eine "State of the Art" RAG-Pipeline mit `google/embeddinggemma-300m` und `google/gemma-3-4b-it`**
+             Diese Version enthält umfangreiche Debug-Ausgaben in der Konsole.
              """
          )
          with gr.Row():
              with gr.Column(scale=1):
+                 gr.Markdown("### 📁 Dokumenten-Management")
+                 file_uploader = gr.File(label="Dateien hochladen (.pdf, .txt, .md)", file_count="multiple", type="filepath")
                  with gr.Row():
+                     index_button = gr.Button("🔄 Index aktualisieren", variant="primary")
+                     clear_index_button = gr.Button("🧹 Index leeren")
+                 index_status = gr.Markdown("Index ist leer.")
                  index_button.click(fn=index_files, inputs=file_uploader, outputs=index_status)
                  clear_index_button.click(fn=clear_index, inputs=None, outputs=index_status)
              with gr.Column(scale=2):
+                 gr.Markdown("### 💬 Chat über deine Dokumente")
                  chatbot = gr.Chatbot(label="Gemma-3 Chat", type="messages", show_copy_button=True, height=600, render_markdown=True)
                  with gr.Row():
+                     msg_textbox = gr.Textbox(label="Deine Frage", placeholder="Stelle eine Frage zu den Dokumenten...", scale=4, autofocus=True)
+                     send_btn = gr.Button("Senden", variant="primary", scale=1)

          def chat_interface(message: str, history: list):
              if not message or not message.strip(): return history
+             print_debug(f"Neue Benutzernachricht empfangen: '{message}'")
              history.append({"role": "user", "content": message})
              history.append({"role": "assistant", "content": ""})
              for token in answer_with_rag(message, history):
                  history[-1]["content"] += token
                  yield history
+             print_debug("Streaming an die UI beendet.")

          msg_textbox.submit(fn=chat_interface, inputs=[msg_textbox, chatbot], outputs=chatbot).then(fn=lambda: gr.update(value=""), outputs=msg_textbox)
          send_btn.click(fn=chat_interface, inputs=[msg_textbox, chatbot], outputs=chatbot).then(fn=lambda: gr.update(value=""), outputs=msg_textbox)
      return demo

  if __name__ == "__main__":
+     print("Anwendung wird gestartet... Modelle werden initialisiert.")
      get_embedding_function()
      get_llm()

      app_demo = build_demo()
+     print("Modelle geladen. Gradio-Interface wird gestartet.")
      app_demo.launch()
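A note on the relevance figures that `retrieve_relevant_chunks` feeds into the prompt: LangChain's FAISS wrapper returns raw index distances from `similarity_search_with_score`, where lower means closer, and with the default L2 index the value is a squared Euclidean distance. The `1 - score` conversion above is therefore a rough heuristic rather than a true similarity. Assuming unit-normalized embeddings (an assumption; the app does not configure normalization), cosine similarity can be recovered exactly; the helper name below is ours, for illustration:

    def squared_l2_to_cosine(squared_l2: float) -> float:
        # For unit vectors u and v: ||u - v||^2 = 2 - 2*cos(u, v),
        # hence cos(u, v) = 1 - ||u - v||^2 / 2.
        return 1.0 - squared_l2 / 2.0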
test/wikitop10.txt ADDED
The diff for this file is too large to render. See raw diff
 
test/wikitop100.txt ADDED
The diff for this file is too large to render. See raw diff
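Both versions of `chat_interface` also lean on a Gradio behavior worth isolating: when an event handler is a generator, Gradio re-renders the output component on every `yield`, which is what turns the token stream into a live-updating chat. A stripped-down sketch of just that mechanism, independent of the RAG code above (the echo handler is invented for illustration):

    import time
    import gradio as gr

    def echo_stream(message: str, history: list):
        # Append a user turn and an empty assistant turn, then grow the
        # assistant turn word by word; every yield repaints the Chatbot.
        history = history + [{"role": "user", "content": message},
                             {"role": "assistant", "content": ""}]
        for word in message.split():
            history[-1]["content"] += word + " "
            time.sleep(0.1)
            yield history

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(type="messages")
        box = gr.Textbox(label="Message")
        box.submit(fn=echo_stream, inputs=[box, chatbot], outputs=chatbot)

    if __name__ == "__main__":
        demo.launch()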