joelg committed on
Commit 610249f · 1 Parent(s): abdb11b

- better reasoning traces handling
- higher token limit and default value
- changed default similarity threshold

Files changed (2)
  1. app.py +6 -6
  2. rag_system.py +105 -38
app.py CHANGED
@@ -177,9 +177,9 @@ def create_interface():
                 similarity_threshold = gr.Slider(
                     minimum=0.0,
                     maximum=1.0,
-                    value=0.0,
+                    value=0.5,
                     step=0.05,
-                    label="Similarity Threshold (minimum score)"
+                    label="Similarity Threshold (minimum score - filters low-quality matches)"
                 )
 
             # Tab 3: Generation Configuration
@@ -206,11 +206,11 @@ def create_interface():
                     label="Temperature (creativity)"
                 )
                 max_tokens = gr.Slider(
-                    minimum=50,
-                    maximum=1000,
-                    value=300,
+                    minimum=100,
+                    maximum=2048,
+                    value=800,
                     step=50,
-                    label="Max Tokens (response length)"
+                    label="Max Tokens (response length - higher for reasoning models)"
                )
 
             # Tab 4: Query & Results
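
For context, the raised threshold is only a UI default; the cutoff itself is applied wherever retrieval scores are compared against the slider value. A minimal sketch of that filtering step, assuming similarity scores in [0, 1] (the `retrieved` list and its `score` field are illustrative, not taken from this commit):

# Hypothetical sketch of the similarity_threshold cutoff; the `retrieved`
# structure is assumed for illustration and is not part of this commit.
retrieved = [
    {"text": "relevant chunk", "score": 0.82},
    {"text": "weak match", "score": 0.31},
]
similarity_threshold = 0.5  # new slider default
kept = [c for c in retrieved if c["score"] >= similarity_threshold]
print(kept)  # only the 0.82 chunk survives the 0.5 cutoff

With the default raised from 0.0 to 0.5, borderline chunks like the 0.31 match above no longer reach the prompt.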
rag_system.py CHANGED
@@ -326,28 +326,43 @@ Question: {query}
 
 Answer:"""
 
-        # Generate response using chat completion
+        # Generate response - try chat_completion first, fallback to text_generation
         try:
-            messages = [
-                {
-                    "role": "user",
-                    "content": prompt
-                }
-            ]
+            # Try chat_completion first
+            try:
+                messages = [
+                    {
+                        "role": "user",
+                        "content": prompt
+                    }
+                ]
+
+                response = self.llm_client.chat_completion(
+                    messages=messages,
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                )
+
+                # Extract answer from response
+                if hasattr(response, 'choices') and len(response.choices) > 0:
+                    answer = response.choices[0].message.content.strip()
+                elif isinstance(response, dict) and 'choices' in response:
+                    answer = response['choices'][0]['message']['content'].strip()
+                else:
+                    answer = str(response).strip()
 
-            response = self.llm_client.chat_completion(
-                messages=messages,
-                max_tokens=max_tokens,
-                temperature=temperature,
-            )
-
-            # Extract answer from response
-            if hasattr(response, 'choices') and len(response.choices) > 0:
-                answer = response.choices[0].message.content.strip()
-            elif isinstance(response, dict) and 'choices' in response:
-                answer = response['choices'][0]['message']['content'].strip()
-            else:
-                answer = str(response).strip()
+            except Exception as chat_error:
+                # Fallback to text_generation
+                print(f"Chat completion failed, trying text_generation: {chat_error}")
+
+                response = self.llm_client.text_generation(
+                    prompt,
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    return_full_text=False,
+                )
+
+                answer = response.strip() if isinstance(response, str) else str(response).strip()
 
             # Handle reasoning tokens (for models like Qwen)
             answer = self._process_reasoning_output(answer)
@@ -361,14 +376,19 @@ Answer:"""
 
     def _process_reasoning_output(self, text: str) -> str:
         """Process output from reasoning models to separate thinking from answer"""
+        # Debug: print first 200 chars to see the format
+        print(f"[DEBUG] Processing output (first 200 chars): {text[:200]}")
+
         # Common patterns for reasoning models
-        # Qwen uses <think>...</think> tags
-        if '<think>' in text and '</think>' in text:
-            # Extract reasoning and answer
-            reasoning_match = re.search(r'<think>(.*?)</think>', text, re.DOTALL)
+        # Qwen uses <think>...</think> tags (case-insensitive check)
+        if '<think>' in text.lower():
+            # Extract reasoning and answer (case-insensitive)
+            reasoning_match = re.search(r'<think>(.*?)</think>', text, re.DOTALL | re.IGNORECASE)
             if reasoning_match:
                 reasoning = reasoning_match.group(1).strip()
-                answer = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
+                answer = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL | re.IGNORECASE).strip()
+
+                print(f"[DEBUG] Found reasoning tokens! Reasoning length: {len(reasoning)}, Answer length: {len(answer)}")
 
                 return f"""**Answer:**
 
@@ -383,6 +403,37 @@ Answer:"""
 {reasoning}
 ```
 
+</details>"""
+
+        # Alternative pattern: Look for common thinking patterns in text
+        # Some models output their reasoning inline without special tags
+        thinking_patterns = [
+            r'(Let me think.*?(?:Answer:|Response:|Conclusion:))',
+            r'(Okay, let\'s see.*?(?:Answer:|Response:|Conclusion:))',
+            r'(First,.*?(?:Therefore,|Thus,|So,|In conclusion,))',
+        ]
+
+        for pattern in thinking_patterns:
+            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
+            if match:
+                reasoning = match.group(1).strip()
+                answer = text[match.end():].strip()
+
+                if len(reasoning) > 100 and len(answer) > 20:  # Substantial reasoning and answer
+                    print(f"[DEBUG] Found inline reasoning! Pattern matched.")
+                    return f"""**Answer:**
+
+{answer}
+
+---
+
+<details>
+<summary>🧠 Model Reasoning (click to expand)</summary>
+
+```
+{reasoning}
+```
+
 </details>"""
 
         # Alternative pattern: text before "Answer:" or similar markers
@@ -393,6 +444,7 @@ Answer:"""
             answer = ''.join(parts[2:]).strip()
 
             if reasoning and len(reasoning) > 50:  # Only if there's substantial reasoning
+                print(f"[DEBUG] Found Answer: marker pattern")
                 return f"""**Answer:**
 
 {answer}
@@ -409,6 +461,7 @@ Answer:"""
 </details>"""
 
         # No reasoning pattern found, return as is
+        print(f"[DEBUG] No reasoning pattern found, returning as-is")
         return text
 
     def generate_example_questions(self, num_questions: int = 5) -> List[str]:
@@ -438,21 +491,35 @@ Text excerpts:
 
 Generate exactly {num_questions} questions, one per line, without numbering:"""
 
-        messages = [{"role": "user", "content": prompt}]
-
-        response = self.llm_client.chat_completion(
-            messages=messages,
-            max_tokens=300,
-            temperature=0.8,
-        )
-
-        # Extract questions
-        if hasattr(response, 'choices') and len(response.choices) > 0:
-            questions_text = response.choices[0].message.content.strip()
-        elif isinstance(response, dict) and 'choices' in response:
-            questions_text = response['choices'][0]['message']['content'].strip()
-        else:
-            questions_text = str(response).strip()
+        # Try chat_completion first, fallback to text_generation
+        try:
+            messages = [{"role": "user", "content": prompt}]
+
+            response = self.llm_client.chat_completion(
+                messages=messages,
+                max_tokens=300,
+                temperature=0.8,
+            )
+
+            # Extract questions
+            if hasattr(response, 'choices') and len(response.choices) > 0:
+                questions_text = response.choices[0].message.content.strip()
+            elif isinstance(response, dict) and 'choices' in response:
+                questions_text = response['choices'][0]['message']['content'].strip()
+            else:
+                questions_text = str(response).strip()
+
+        except Exception as chat_error:
+            print(f"Chat completion failed for questions, trying text_generation: {chat_error}")
+
+            response = self.llm_client.text_generation(
+                prompt,
+                max_new_tokens=300,
+                temperature=0.8,
+                return_full_text=False,
+            )
+
+            questions_text = response.strip() if isinstance(response, str) else str(response).strip()
 
         # Clean up reasoning if present
         questions_text = re.sub(r'<think>.*?</think>', '', questions_text, flags=re.DOTALL)
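
Taken together, the rag_system.py changes establish one calling convention in two places: prefer the OpenAI-style chat_completion endpoint, fall back to raw text_generation when it is unavailable, then strip any <think>...</think> trace before display. A condensed, self-contained sketch of that pattern with huggingface_hub.InferenceClient (the model name and sampling values are assumptions for illustration, not taken from this commit):

import re
from huggingface_hub import InferenceClient

client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct")  # assumed model
prompt = "Explain retrieval-augmented generation in one sentence."

try:
    # Preferred path: OpenAI-style chat completion.
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=800,  # mirrors the new slider default
        temperature=0.7,
    )
    answer = response.choices[0].message.content.strip()
except Exception as chat_error:
    # Fallback for endpoints that only expose raw text generation.
    print(f"Chat completion failed, trying text_generation: {chat_error}")
    answer = client.text_generation(
        prompt,
        max_new_tokens=800,
        temperature=0.7,
        return_full_text=False,
    ).strip()

# Strip <think>...</think> reasoning traces, as _process_reasoning_output does.
answer = re.sub(r"<think>.*?</think>", "", answer, flags=re.DOTALL | re.IGNORECASE).strip()
print(answer)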