neuralworm committed
Commit 1e88950 · 1 Parent(s): 3aa1632
Files changed (3)
  1. app.py +85 -52
  2. test/wikitop10.txt +0 -0
  3. test/wikitop100.txt +0 -0
app.py CHANGED
@@ -1,17 +1,17 @@
- # app.py - v2.0 (Production)
- # Description: Final, stable, production-ready version. Fixes the last streaming problem
- # by using the correct `TextIteratorStreamer`, which was designed for
- # programmatic iteration in UIs such as Gradio.

  import os
  import torch
  import gradio as gr

  from typing import List, Tuple, Generator, Dict
  from threading import Thread

  # ML / Transformers
- # HERE IS THE FIX: TextIteratorStreamer instead of TextStreamer
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer

  # Document processing & RAG
@@ -32,6 +32,10 @@ LLM_MODEL: Gemma3ForConditionalGeneration = None
  LLM_PROCESSOR: AutoProcessor = None
  VECTOR_STORE: FAISS = None

  # --------------------------------------------------------------------
  # Model Loading
  # --------------------------------------------------------------------
@@ -42,16 +46,20 @@ def get_device() -> torch.device:
  def get_embedding_function() -> HuggingFaceEmbeddings:
      global EMBEDDING_FUNCTION
      if EMBEDDING_FUNCTION is None:
          EMBEDDING_FUNCTION = HuggingFaceEmbeddings(
              model_name=EMBED_MODEL_ID,
-             model_kwargs={'device': get_device()}
          )
      return EMBEDDING_FUNCTION

  def get_llm() -> Tuple[Gemma3ForConditionalGeneration, AutoProcessor]:
      global LLM_MODEL, LLM_PROCESSOR
      if LLM_MODEL is None or LLM_PROCESSOR is None:
          device = get_device()
          dtype = torch.bfloat16 if "cuda" in device.type else torch.float32
          LLM_MODEL = Gemma3ForConditionalGeneration.from_pretrained(
              LLM_MODEL_ID,
@@ -59,6 +67,7 @@ def get_llm() -> Tuple[Gemma3ForConditionalGeneration, AutoProcessor]:
              device_map="auto",
          ).eval()
          LLM_PROCESSOR = AutoProcessor.from_pretrained(LLM_MODEL_ID)
      return LLM_MODEL, LLM_PROCESSOR

  # --------------------------------------------------------------------
@@ -90,12 +99,13 @@ def get_text_splitter() -> RecursiveCharacterTextSplitter:
  # --------------------------------------------------------------------
  def index_files(file_paths: List[str], progress=gr.Progress(track_tqdm=True)) -> str:
      global VECTOR_STORE
-     if not file_paths: return "No files selected for indexing."

      embedding_function = get_embedding_function()
      text_splitter = get_text_splitter()
      documents: List[Document] = []
-     for path in progress.tqdm(file_paths, desc="1/2: Processing & chunking files"):
          if path is None: continue
          text = extract_text_from_file(path)
          if not text.strip(): continue
@@ -105,133 +115,156 @@ def index_files(file_paths: List[str], progress=gr.Progress(track_tqdm=True)) ->
              doc = Document(page_content=chunk, metadata={"source": source_name})
              documents.append(doc)

-     if not documents: return "No text could be extracted from the files."

-     progress(0.5, desc="2/2: Creating embeddings & building FAISS index...")
      new_store = FAISS.from_documents(documents, embedding_function)

      if VECTOR_STORE is None: VECTOR_STORE = new_store
      else: VECTOR_STORE.add_documents(documents)

      final_count = VECTOR_STORE.index.ntotal
-     return f"Index updated: {final_count} chunks in total."

  def clear_index() -> str:
      global VECTOR_STORE
      VECTOR_STORE = None
      import gc; gc.collect()
-     return "Index cleared."

  def retrieve_relevant_chunks(query: str, top_k: int = 5) -> List[Dict]:
-     if VECTOR_STORE is None: return []
      results_with_scores = VECTOR_STORE.similarity_search_with_score(query, k=top_k)
-     return [{
          "content": doc.page_content,
-         "source": doc.metadata.get("source", "Unknown"),
          "score": 1 - score
      } for doc, score in results_with_scores]

  # --------------------------------------------------------------------
  # LLM generation with streaming
  # --------------------------------------------------------------------
  def build_rag_prompt(user_question: str, retrieved_chunks: List[Dict]) -> str:
-     # ... (this function is unchanged)
      if not retrieved_chunks:
-         context_str = "No relevant context documents were found."
      else:
          context_parts = []
          for i, ch in enumerate(retrieved_chunks, start=1):
-             context_parts.append(
-                 f"Document [{i}] (Source: {ch['source']}, Relevance: {ch['score']:.3f}):\n\"{ch['content']}\""
-             )
          context_str = "\n\n".join(context_parts)
-     prompt = (f"You are a precise, helpful assistant. Your task is to answer the following user question based "
-               f"exclusively on the context documents provided below. "
-               f"If the answer is not contained within the documents, state clearly: 'The information is not available in the provided documents.' "
-               f"Answer in German, summarizing the relevant information instead of quoting verbatim.\n\n"
-               f"--- Context Documents ---\n{context_str}\n\n"
-               f"--- User Question ---\n{user_question}\n\n"
-               f"--- Your Answer ---\n")
      return prompt

  def answer_with_rag(question: str, history: list) -> Generator[str, None, None]:
      model, processor = get_llm()
-     # HERE IS THE FIX: use TextIteratorStreamer
      streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

      retrieved = retrieve_relevant_chunks(question, top_k=5)
      prompt = build_rag_prompt(question, retrieved)
      messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]

      input_ids = processor.apply_chat_template(
          messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
      ).to(model.device)

      generation_kwargs = {
-         "input_ids": input_ids,
-         "streamer": streamer,
-         "max_new_tokens": 1024,
-         "do_sample": True,
-         "temperature": 0.7,
-         "top_p": 0.9,
      }

-     # Generation has to run in a separate thread so that we can iterate
-     # over the streamer in the main thread.
      thread = Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()

-     # Now we can iterate over the streamer and pass the tokens on to the UI.
      for new_text in streamer:
          yield new_text

  # --------------------------------------------------------------------
  # Gradio UI
  # --------------------------------------------------------------------
  def build_demo() -> gr.Blocks:
-     with gr.Blocks(title="Gemma 3 RAG v2.0", theme="soft") as demo:
          gr.Markdown(
              """
-             # 🔍 Gemma 3 RAG v2.0 - Production Ready
-             **A State-of-the-Art RAG pipeline with `google/embeddinggemma-300m` and `google/gemma-3-4b-it`**
-             1. Upload your documents and click "Update Index".
-             2. Ask your questions in the chat window. The answers will be streamed live.
              """
          )
          with gr.Row():
              with gr.Column(scale=1):
-                 gr.Markdown("### 📁 Document Management")
-                 file_uploader = gr.File(label="Upload Files (.pdf, .txt, .md)", file_count="multiple", type="filepath")
                  with gr.Row():
-                     index_button = gr.Button("🔄 Update Index", variant="primary")
-                     clear_index_button = gr.Button("🧹 Clear Index")
-                 index_status = gr.Markdown("Index is empty.")
                  index_button.click(fn=index_files, inputs=file_uploader, outputs=index_status)
                  clear_index_button.click(fn=clear_index, inputs=None, outputs=index_status)
              with gr.Column(scale=2):
-                 gr.Markdown("### 💬 Chat About Your Documents")
                  chatbot = gr.Chatbot(label="Gemma-3 Chat", type="messages", show_copy_button=True, height=600, render_markdown=True)
                  with gr.Row():
-                     msg_textbox = gr.Textbox(label="Your Question", placeholder="Ask something about the uploaded documents...", scale=4, autofocus=True)
-                     send_btn = gr.Button("Send", variant="primary", scale=1)

          def chat_interface(message: str, history: list):
              if not message or not message.strip(): return history
              history.append({"role": "user", "content": message})
              history.append({"role": "assistant", "content": ""})
              for token in answer_with_rag(message, history):
                  history[-1]["content"] += token
                  yield history

          msg_textbox.submit(fn=chat_interface, inputs=[msg_textbox, chatbot], outputs=chatbot).then(fn=lambda: gr.update(value=""), outputs=msg_textbox)
          send_btn.click(fn=chat_interface, inputs=[msg_textbox, chatbot], outputs=chatbot).then(fn=lambda: gr.update(value=""), outputs=msg_textbox)
      return demo

  if __name__ == "__main__":
-     print("Starting application... Initializing models.")
      get_embedding_function()
      get_llm()

      app_demo = build_demo()
-     print("Models loaded. Launching Gradio interface.")
      app_demo.launch()
 
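For reference, the streaming idiom this commit settles on, separated from the app's specifics: `model.generate()` blocks until generation finishes, so it runs in a worker thread while the caller iterates the `TextIteratorStreamer`, which yields decoded text chunks as tokens arrive. A minimal, self-contained sketch (the model name below is a placeholder chosen for illustration, not this app's `LLM_MODEL_ID`):

    from threading import Thread
    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    tok = AutoTokenizer.from_pretrained("gpt2")           # placeholder model
    model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder model

    streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
    inputs = tok("The streaming pattern works like this:", return_tensors="pt")

    # generate() blocks, so it runs in a background thread; the main thread
    # consumes decoded text chunks from the streamer as they are produced.
    thread = Thread(target=model.generate,
                    kwargs={**inputs, "streamer": streamer, "max_new_tokens": 32})
    thread.start()
    for chunk in streamer:
        print(chunk, end="", flush=True)
    thread.join()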
+ # app.py - v2.1 (Debug Edition)
+ # Description: Combines the functional stability of v2.0 with the extensive
+ # debugging and assertions of earlier versions. This version is ideal
+ # for development, troubleshooting, and understanding the internal flow.

  import os
  import torch
  import gradio as gr
+ import time

  from typing import List, Tuple, Generator, Dict
  from threading import Thread

  # ML / Transformers
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer

  # Document processing & RAG

  LLM_PROCESSOR: AutoProcessor = None
  VECTOR_STORE: FAISS = None

+ def print_debug(message: str):
+     """Consistent debug output with a timestamp."""
+     print(f"[DEBUG {time.strftime('%H:%M:%S')}] {message}")
+
  # --------------------------------------------------------------------
  # Model Loading
  # --------------------------------------------------------------------

  def get_embedding_function() -> HuggingFaceEmbeddings:
      global EMBEDDING_FUNCTION
      if EMBEDDING_FUNCTION is None:
+         device = get_device()
+         print_debug(f"Initialisiere Embedding-Modell '{EMBED_MODEL_ID}' auf Device '{device}'.")
          EMBEDDING_FUNCTION = HuggingFaceEmbeddings(
              model_name=EMBED_MODEL_ID,
+             model_kwargs={'device': device}
          )
+         print_debug("Embedding-Modell erfolgreich initialisiert.")
      return EMBEDDING_FUNCTION

  def get_llm() -> Tuple[Gemma3ForConditionalGeneration, AutoProcessor]:
      global LLM_MODEL, LLM_PROCESSOR
      if LLM_MODEL is None or LLM_PROCESSOR is None:
          device = get_device()
+         print_debug(f"Initialisiere LLM '{LLM_MODEL_ID}' auf Device '{device}'.")
          dtype = torch.bfloat16 if "cuda" in device.type else torch.float32
          LLM_MODEL = Gemma3ForConditionalGeneration.from_pretrained(
              LLM_MODEL_ID,

              device_map="auto",
          ).eval()
          LLM_PROCESSOR = AutoProcessor.from_pretrained(LLM_MODEL_ID)
+         print_debug("LLM und Prozessor erfolgreich initialisiert.")
      return LLM_MODEL, LLM_PROCESSOR

  # --------------------------------------------------------------------

  # --------------------------------------------------------------------
  def index_files(file_paths: List[str], progress=gr.Progress(track_tqdm=True)) -> str:
      global VECTOR_STORE
+     if not file_paths: return "Keine Dateien zum Indexieren ausgewählt."
+     print_debug(f"Indexierung gestartet für {len(file_paths)} Datei(en).")

      embedding_function = get_embedding_function()
      text_splitter = get_text_splitter()
      documents: List[Document] = []
+     for path in progress.tqdm(file_paths, desc="1/2: Dateien verarbeiten & chunken"):
          if path is None: continue
          text = extract_text_from_file(path)
          if not text.strip(): continue

              doc = Document(page_content=chunk, metadata={"source": source_name})
              documents.append(doc)

+     assert all(isinstance(d, Document) for d in documents), "Alle Elemente in 'documents' müssen vom Typ langchain.Document sein."
+     print_debug(f"Erfolgreich {len(documents)} Chunks aus den Dateien erstellt.")
+     if not documents: return "Kein Text in den Dateien gefunden, der indexiert werden konnte."

+     progress(0.5, desc="2/2: Embeddings erstellen & FAISS Index aufbauen...")
      new_store = FAISS.from_documents(documents, embedding_function)
+     print_debug("FAISS Index erfolgreich aus Dokumenten erstellt.")

      if VECTOR_STORE is None: VECTOR_STORE = new_store
      else: VECTOR_STORE.add_documents(documents)

+     assert VECTOR_STORE is not None and VECTOR_STORE.index.ntotal > 0, "VECTOR_STORE wurde nicht korrekt initialisiert."
      final_count = VECTOR_STORE.index.ntotal
+     print_debug(f"Indexierung abgeschlossen. Gesamtanzahl der Chunks im Index: {final_count}")
+     return f"Index aktualisiert: {final_count} Chunks insgesamt."

  def clear_index() -> str:
      global VECTOR_STORE
      VECTOR_STORE = None
      import gc; gc.collect()
+     print_debug("Vektor-Index wurde geleert.")
+     return "Index geleert."

  def retrieve_relevant_chunks(query: str, top_k: int = 5) -> List[Dict]:
+     if VECTOR_STORE is None:
+         print_debug("Retrieval versucht, aber Vektor-Index ist leer.")
+         return []
+
+     print_debug(f"Suche nach {top_k} relevanten Chunks für die Anfrage: '{query}'")
      results_with_scores = VECTOR_STORE.similarity_search_with_score(query, k=top_k)
+
+     formatted_results = [{
          "content": doc.page_content,
+         "source": doc.metadata.get("source", "Unbekannt"),
          "score": 1 - score
      } for doc, score in results_with_scores]

+     assert isinstance(formatted_results, list), "Retrieval-Ergebnis muss eine Liste sein."
+     if formatted_results:
+         assert all("content" in r and "source" in r and "score" in r for r in formatted_results), "Jedes Retrieval-Ergebnis muss 'content', 'source' und 'score' enthalten."
+
+     print_debug(f"{len(formatted_results)} Chunks gefunden.")
+     return formatted_results
+
  # --------------------------------------------------------------------
  # LLM generation with streaming
  # --------------------------------------------------------------------
  def build_rag_prompt(user_question: str, retrieved_chunks: List[Dict]) -> str:
      if not retrieved_chunks:
+         context_str = "Es wurden keine relevanten Dokumente im Kontext gefunden."
      else:
          context_parts = []
          for i, ch in enumerate(retrieved_chunks, start=1):
+             context_parts.append(f"Dokument [{i}] (Quelle: {ch['source']}, Relevanz: {ch['score']:.3f}):\n\"{ch['content']}\"")
          context_str = "\n\n".join(context_parts)
+     prompt = (f"Du bist ein präziser, hilfreicher Assistent. Deine Aufgabe ist es, die folgende Benutzerfrage ausschließlich "
+               f"basierend auf den unten stehenden Kontext-Dokumenten zu beantworten. "
+               f"Wenn die Antwort nicht in den Dokumenten enthalten ist, gib klar an: 'Die Information ist in den bereitgestellten Dokumenten nicht enthalten.' "
+               f"Antworte auf Deutsch und fasse die relevanten Informationen zusammen, anstatt die Dokumente wörtlich zu zitieren.\n\n"
+               f"--- Kontext-Dokumente ---\n{context_str}\n\n"
+               f"--- Benutzerfrage ---\n{user_question}\n\n"
+               f"--- Deine Antwort ---\n")
      return prompt

  def answer_with_rag(question: str, history: list) -> Generator[str, None, None]:
+     print_debug("Starte RAG-Antwort-Generierung.")
      model, processor = get_llm()
      streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

      retrieved = retrieve_relevant_chunks(question, top_k=5)
      prompt = build_rag_prompt(question, retrieved)
+     print_debug(f"Generierter RAG-Prompt (erste 200 Zeichen): '{prompt[:200]}'")
+
      messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]

+     print_debug(f"Nachrichten-Struktur wird für Prozessor vorbereitet: {str(messages)}")
+     assert isinstance(messages, list) and len(messages) > 0, "Messages muss eine nicht-leere Liste sein."
+     assert isinstance(messages[0], dict) and "role" in messages[0] and "content" in messages[0], "Nachricht muss ein Dictionary mit 'role' und 'content' sein."
+     assert isinstance(messages[0]["content"], list) and len(messages[0]["content"]) > 0, "Content muss eine nicht-leere Liste sein."
+     assert isinstance(messages[0]["content"][0], dict) and "type" in messages[0]["content"][0] and "text" in messages[0]["content"][0], "Content-Block muss ein Dictionary mit 'type' und 'text' sein."
+     print_debug("ASSERTIONS für Nachrichten-Struktur erfolgreich bestanden.")
+
      input_ids = processor.apply_chat_template(
          messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
      ).to(model.device)

+     assert isinstance(input_ids, torch.Tensor), "Der Prozessor sollte einen torch.Tensor zurückgeben."
+     print_debug(f"Prozessor hat 'input_ids' mit der Form {input_ids.shape} erstellt.")
+
      generation_kwargs = {
+         "input_ids": input_ids, "streamer": streamer, "max_new_tokens": 1024,
+         "do_sample": True, "temperature": 0.7, "top_p": 0.9,
      }

      thread = Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()
+     print_debug("LLM-Generierungs-Thread gestartet.")

      for new_text in streamer:
          yield new_text
+     print_debug("LLM-Generierung abgeschlossen.")

  # --------------------------------------------------------------------
  # Gradio UI
  # --------------------------------------------------------------------
  def build_demo() -> gr.Blocks:
+     with gr.Blocks(title="Gemma 3 RAG v2.1", theme="soft") as demo:
          gr.Markdown(
              """
+             # 🔍 Gemma 3 RAG v2.1 - Debug Edition
+             **Eine "State of the Art" RAG-Pipeline mit `google/embeddinggemma-300m` und `google/gemma-3-4b-it`**
+             Diese Version enthält umfangreiche Debug-Ausgaben in der Konsole.
              """
          )
          with gr.Row():
              with gr.Column(scale=1):
+                 gr.Markdown("### 📁 Dokumenten-Management")
+                 file_uploader = gr.File(label="Dateien hochladen (.pdf, .txt, .md)", file_count="multiple", type="filepath")
                  with gr.Row():
+                     index_button = gr.Button("🔄 Index aktualisieren", variant="primary")
+                     clear_index_button = gr.Button("🧹 Index leeren")
+                 index_status = gr.Markdown("Index ist leer.")
                  index_button.click(fn=index_files, inputs=file_uploader, outputs=index_status)
                  clear_index_button.click(fn=clear_index, inputs=None, outputs=index_status)
              with gr.Column(scale=2):
+                 gr.Markdown("### 💬 Chat über deine Dokumente")
                  chatbot = gr.Chatbot(label="Gemma-3 Chat", type="messages", show_copy_button=True, height=600, render_markdown=True)
                  with gr.Row():
+                     msg_textbox = gr.Textbox(label="Deine Frage", placeholder="Stelle eine Frage zu den Dokumenten...", scale=4, autofocus=True)
+                     send_btn = gr.Button("Senden", variant="primary", scale=1)

          def chat_interface(message: str, history: list):
              if not message or not message.strip(): return history
+             print_debug(f"Neue Benutzernachricht empfangen: '{message}'")
              history.append({"role": "user", "content": message})
              history.append({"role": "assistant", "content": ""})
              for token in answer_with_rag(message, history):
                  history[-1]["content"] += token
                  yield history
+             print_debug("Streaming an die UI beendet.")

          msg_textbox.submit(fn=chat_interface, inputs=[msg_textbox, chatbot], outputs=chatbot).then(fn=lambda: gr.update(value=""), outputs=msg_textbox)
          send_btn.click(fn=chat_interface, inputs=[msg_textbox, chatbot], outputs=chatbot).then(fn=lambda: gr.update(value=""), outputs=msg_textbox)
      return demo

  if __name__ == "__main__":
+     print("Anwendung wird gestartet... Modelle werden initialisiert.")
      get_embedding_function()
      get_llm()

      app_demo = build_demo()
+     print("Modelle geladen. Gradio-Interface wird gestartet.")
      app_demo.launch()
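A note on the relevance figures that `retrieve_relevant_chunks` feeds into the prompt: LangChain's FAISS wrapper returns raw index distances from `similarity_search_with_score`, where lower means closer, and with the default L2 index the value is a squared Euclidean distance. The `1 - score` conversion above is therefore a rough heuristic rather than a true similarity. Assuming unit-normalized embeddings (an assumption; the app does not configure normalization), cosine similarity can be recovered exactly; the helper name below is ours, for illustration:

    def squared_l2_to_cosine(squared_l2: float) -> float:
        # For unit vectors u and v: ||u - v||^2 = 2 - 2*cos(u, v),
        # hence cos(u, v) = 1 - ||u - v||^2 / 2.
        return 1.0 - squared_l2 / 2.0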
test/wikitop10.txt ADDED
The diff for this file is too large to render. See raw diff
 
test/wikitop100.txt ADDED
The diff for this file is too large to render. See raw diff
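Both versions of `chat_interface` also lean on a Gradio behavior worth isolating: when an event handler is a generator, Gradio re-renders the output component on every `yield`, which is what turns the token stream into a live-updating chat. A stripped-down sketch of just that mechanism, independent of the RAG code above (the echo handler is invented for illustration):

    import time
    import gradio as gr

    def echo_stream(message: str, history: list):
        # Append a user turn and an empty assistant turn, then grow the
        # assistant turn word by word; every yield repaints the Chatbot.
        history = history + [{"role": "user", "content": message},
                             {"role": "assistant", "content": ""}]
        for word in message.split():
            history[-1]["content"] += word + " "
            time.sleep(0.1)
            yield history

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(type="messages")
        box = gr.Textbox(label="Message")
        box.submit(fn=echo_stream, inputs=[box, chatbot], outputs=chatbot)

    if __name__ == "__main__":
        demo.launch()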