joelg committed on
Commit 610249f · 1 Parent(s): abdb11b

- better reasoning traces handling
- higher token limit and default value
- changed default similarity threshold

Files changed (2)
  1. app.py +6 -6
  2. rag_system.py +105 -38
app.py CHANGED
@@ -177,9 +177,9 @@ def create_interface():
                 similarity_threshold = gr.Slider(
                     minimum=0.0,
                     maximum=1.0,
-                    value=0.0,
+                    value=0.5,
                     step=0.05,
-                    label="Similarity Threshold (minimum score)"
+                    label="Similarity Threshold (minimum score - filters low-quality matches)"
                 )
 
             # Tab 3: Generation Configuration
@@ -206,11 +206,11 @@ def create_interface():
                     label="Temperature (creativity)"
                 )
                 max_tokens = gr.Slider(
-                    minimum=50,
-                    maximum=1000,
-                    value=300,
+                    minimum=100,
+                    maximum=2048,
+                    value=800,
                     step=50,
-                    label="Max Tokens (response length)"
+                    label="Max Tokens (response length - higher for reasoning models)"
                )
 
             # Tab 4: Query & Results
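
For context, the raised threshold is only a UI default; the cutoff itself is applied wherever retrieval scores are compared against the slider value. A minimal sketch of that filtering step, assuming similarity scores in [0, 1] (the `retrieved` list and its `score` field are illustrative, not taken from this commit):

# Hypothetical sketch of the similarity_threshold cutoff; the `retrieved`
# structure is assumed for illustration and is not part of this commit.
retrieved = [
    {"text": "relevant chunk", "score": 0.82},
    {"text": "weak match", "score": 0.31},
]
similarity_threshold = 0.5  # new slider default
kept = [c for c in retrieved if c["score"] >= similarity_threshold]
print(kept)  # only the 0.82 chunk survives the 0.5 cutoff

With the default raised from 0.0 to 0.5, borderline chunks like the 0.31 match above no longer reach the prompt.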
rag_system.py CHANGED
@@ -326,28 +326,43 @@ Question: {query}
 
 Answer:"""
 
-        # Generate response using chat completion
+        # Generate response - try chat_completion first, fallback to text_generation
         try:
-            messages = [
-                {
-                    "role": "user",
-                    "content": prompt
-                }
-            ]
+            # Try chat_completion first
+            try:
+                messages = [
+                    {
+                        "role": "user",
+                        "content": prompt
+                    }
+                ]
+
+                response = self.llm_client.chat_completion(
+                    messages=messages,
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                )
+
+                # Extract answer from response
+                if hasattr(response, 'choices') and len(response.choices) > 0:
+                    answer = response.choices[0].message.content.strip()
+                elif isinstance(response, dict) and 'choices' in response:
+                    answer = response['choices'][0]['message']['content'].strip()
+                else:
+                    answer = str(response).strip()
 
-            response = self.llm_client.chat_completion(
-                messages=messages,
-                max_tokens=max_tokens,
-                temperature=temperature,
-            )
-
-            # Extract answer from response
-            if hasattr(response, 'choices') and len(response.choices) > 0:
-                answer = response.choices[0].message.content.strip()
-            elif isinstance(response, dict) and 'choices' in response:
-                answer = response['choices'][0]['message']['content'].strip()
-            else:
-                answer = str(response).strip()
+            except Exception as chat_error:
+                # Fallback to text_generation
+                print(f"Chat completion failed, trying text_generation: {chat_error}")
+
+                response = self.llm_client.text_generation(
+                    prompt,
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    return_full_text=False,
+                )
+
+                answer = response.strip() if isinstance(response, str) else str(response).strip()
 
             # Handle reasoning tokens (for models like Qwen)
             answer = self._process_reasoning_output(answer)
@@ -361,14 +376,19 @@ Answer:"""
 
     def _process_reasoning_output(self, text: str) -> str:
         """Process output from reasoning models to separate thinking from answer"""
+        # Debug: print first 200 chars to see the format
+        print(f"[DEBUG] Processing output (first 200 chars): {text[:200]}")
+
         # Common patterns for reasoning models
-        # Qwen uses <think>...</think> tags
-        if '<think>' in text and '</think>' in text:
-            # Extract reasoning and answer
-            reasoning_match = re.search(r'<think>(.*?)</think>', text, re.DOTALL)
+        # Qwen uses <think>...</think> tags (case-insensitive check)
+        if '<think>' in text.lower():
+            # Extract reasoning and answer (case-insensitive)
+            reasoning_match = re.search(r'<think>(.*?)</think>', text, re.DOTALL | re.IGNORECASE)
             if reasoning_match:
                 reasoning = reasoning_match.group(1).strip()
-                answer = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
+                answer = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL | re.IGNORECASE).strip()
+
+                print(f"[DEBUG] Found reasoning tokens! Reasoning length: {len(reasoning)}, Answer length: {len(answer)}")
 
                 return f"""**Answer:**
 
@@ -383,6 +403,37 @@ Answer:"""
 {reasoning}
 ```
 
+</details>"""
+
+        # Alternative pattern: Look for common thinking patterns in text
+        # Some models output their reasoning inline without special tags
+        thinking_patterns = [
+            r'(Let me think.*?(?:Answer:|Response:|Conclusion:))',
+            r'(Okay, let\'s see.*?(?:Answer:|Response:|Conclusion:))',
+            r'(First,.*?(?:Therefore,|Thus,|So,|In conclusion,))',
+        ]
+
+        for pattern in thinking_patterns:
+            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
+            if match:
+                reasoning = match.group(1).strip()
+                answer = text[match.end():].strip()
+
+                if len(reasoning) > 100 and len(answer) > 20:  # Substantial reasoning and answer
+                    print(f"[DEBUG] Found inline reasoning! Pattern matched.")
+                    return f"""**Answer:**
+
+{answer}
+
+---
+
+<details>
+<summary>🧠 Model Reasoning (click to expand)</summary>
+
+```
+{reasoning}
+```
+
 </details>"""
 
         # Alternative pattern: text before "Answer:" or similar markers
@@ -393,6 +444,7 @@ Answer:"""
             answer = ''.join(parts[2:]).strip()
 
             if reasoning and len(reasoning) > 50:  # Only if there's substantial reasoning
+                print(f"[DEBUG] Found Answer: marker pattern")
                 return f"""**Answer:**
 
 {answer}
@@ -409,6 +461,7 @@ Answer:"""
 </details>"""
 
         # No reasoning pattern found, return as is
+        print(f"[DEBUG] No reasoning pattern found, returning as-is")
         return text
 
     def generate_example_questions(self, num_questions: int = 5) -> List[str]:
@@ -438,21 +491,35 @@ Text excerpts:
 
 Generate exactly {num_questions} questions, one per line, without numbering:"""
 
-        messages = [{"role": "user", "content": prompt}]
-
-        response = self.llm_client.chat_completion(
-            messages=messages,
-            max_tokens=300,
-            temperature=0.8,
-        )
-
-        # Extract questions
-        if hasattr(response, 'choices') and len(response.choices) > 0:
-            questions_text = response.choices[0].message.content.strip()
-        elif isinstance(response, dict) and 'choices' in response:
-            questions_text = response['choices'][0]['message']['content'].strip()
-        else:
-            questions_text = str(response).strip()
+        # Try chat_completion first, fallback to text_generation
+        try:
+            messages = [{"role": "user", "content": prompt}]
+
+            response = self.llm_client.chat_completion(
+                messages=messages,
+                max_tokens=300,
+                temperature=0.8,
+            )
+
+            # Extract questions
+            if hasattr(response, 'choices') and len(response.choices) > 0:
+                questions_text = response.choices[0].message.content.strip()
+            elif isinstance(response, dict) and 'choices' in response:
+                questions_text = response['choices'][0]['message']['content'].strip()
+            else:
+                questions_text = str(response).strip()
+
+        except Exception as chat_error:
+            print(f"Chat completion failed for questions, trying text_generation: {chat_error}")
+
+            response = self.llm_client.text_generation(
+                prompt,
+                max_new_tokens=300,
+                temperature=0.8,
+                return_full_text=False,
+            )
+
+            questions_text = response.strip() if isinstance(response, str) else str(response).strip()
 
         # Clean up reasoning if present
         questions_text = re.sub(r'<think>.*?</think>', '', questions_text, flags=re.DOTALL)
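
Taken together, the rag_system.py changes establish one calling convention in two places: prefer the OpenAI-style chat_completion endpoint, fall back to raw text_generation when it is unavailable, then strip any <think>...</think> trace before display. A condensed, self-contained sketch of that pattern with huggingface_hub.InferenceClient (the model name and sampling values are assumptions for illustration, not taken from this commit):

import re
from huggingface_hub import InferenceClient

client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct")  # assumed model
prompt = "Explain retrieval-augmented generation in one sentence."

try:
    # Preferred path: OpenAI-style chat completion.
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=800,  # mirrors the new slider default
        temperature=0.7,
    )
    answer = response.choices[0].message.content.strip()
except Exception as chat_error:
    # Fallback for endpoints that only expose raw text generation.
    print(f"Chat completion failed, trying text_generation: {chat_error}")
    answer = client.text_generation(
        prompt,
        max_new_tokens=800,
        temperature=0.7,
        return_full_text=False,
    ).strip()

# Strip <think>...</think> reasoning traces, as _process_reasoning_output does.
answer = re.sub(r"<think>.*?</think>", "", answer, flags=re.DOTALL | re.IGNORECASE).strip()
print(answer)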