ayjays132 committed on
Commit 828f04e · verified · 1 Parent(s): 1c116cd

Upload 2 files

Files changed (2)
  1. chain_of_thought_gui.py +748 -99
  2. chain_of_thought_wrapper.py +570 -93
chain_of_thought_gui.py CHANGED
@@ -1,119 +1,768 @@
1
  #!/usr/bin/env python3
2
  """
3
- NeuroReasoner 1 Chain-of-Thought GUI
4
  -------------------------------------------------------------
5
- A futuristic, user-friendly Streamlit app for step-by-step reasoning
6
- using any Hugging Face causal LM.
7
-
8
- Features:
9
- • Load any model by repo name or local path
10
- • Full control of generation params (Temp, top‑k/p, etc.)
11
- • Self‑Consistency sampling
12
- • ASCII telemetry panels
13
- • Progress indicators and collapsible reasoning details
14
  """
15
  import os
16
  import time
17
- import torch
18
- import pynvml
19
  import streamlit as st
20
- from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
21
- from chain_of_thought_wrapper import ChainOfThoughtWrapper
22
 
23
- # Initialize GPU telemetry
24
  try:
25
  pynvml.nvmlInit()
26
  GPU_AVAILABLE = True
27
  except Exception:
28
  GPU_AVAILABLE = False
29
 
30
- @st.cache_data(show_spinner=False)
31
- def get_telemetry():
32
  if not GPU_AVAILABLE or not torch.cuda.is_available():
33
- return "[No GPU telemetry]"
34
- handle = pynvml.nvmlDeviceGetHandleByIndex(0)
35
- util = pynvml.nvmlDeviceGetUtilizationRates(handle)
36
- mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
37
- return f"GPU: {util.gpu}% | Mem: {mem.used//1024**2}/{mem.total//1024**2} MB"
38
-
39
- # Sidebar configuration
40
- st.sidebar.title("⚙️ Configuration")
41
- model_name = st.sidebar.text_input(
42
- "Model (HuggingFace repo or local path)", value="ayjays132/NeuroReasoner-1-NR-1"
43
- )
44
- device = st.sidebar.selectbox("Device", options=["cuda" if torch.cuda.is_available() else "cpu", "cpu"] )
45
- num_sequences = st.sidebar.slider("# Chains", min_value=1, max_value=10, value=3)
46
- self_consistency = st.sidebar.checkbox("Self‑Consistency", value=False)
47
- max_new_tokens = st.sidebar.slider("Max New Tokens", 50, 1024, 256)
48
- temperature = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7)
49
- top_k = st.sidebar.slider("Top-k", 0, 200, 50)
50
- top_p = st.sidebar.slider("Top-p", 0.0, 1.0, 0.9)
51
- no_repeat_ngram = st.sidebar.slider("No‑repeat ngram", 0, 10, 3)
52
-
53
- # Main interface
54
- st.markdown("# 🌀 NeuroReasoner CoT GUI")
55
- col1, col2 = st.columns([3,1])
56
- with col1:
57
- prompt = st.text_area("🚀 Enter your prompt", value="Explain why the sky is blue.", height=120)
58
- with col2:
59
- st.metric("Telemetry", get_telemetry())
60
-
61
- if st.button("🪄 Generate Reasoning", type="primary"):
62
- if not prompt.strip():
63
- st.error("Please enter a prompt.")
64
  st.stop()
65
- # Load model & tokenizer
 
66
  try:
67
- with st.spinner("🌐 Loading model and tokenizer..."):
68
- tokenizer = AutoTokenizer.from_pretrained(model_name)
69
- model = AutoModelForCausalLM.from_pretrained(model_name)
70
- model.to(device)
71
- st.success("✅ Model loaded.")
72
  except Exception as e:
73
- st.error(f"❌ Load error: {e}")
 
74
  st.stop()
75
- # Setup CoT
76
- cfg = GenerationConfig(
77
- max_new_tokens=max_new_tokens,
78
- temperature=temperature,
79
- top_k=top_k,
80
- top_p=top_p,
81
- do_sample=True,
82
- num_return_sequences=(num_sequences if self_consistency else 1),
83
- no_repeat_ngram_size=no_repeat_ngram,
84
- eos_token_id=tokenizer.eos_token_id,
85
- pad_token_id=tokenizer.pad_token_id
86
- )
87
- cot = ChainOfThoughtWrapper(
88
- model=model,
89
- tokenizer=tokenizer,
90
- generation_config=cfg,
91
- device=device,
92
- self_consistency=self_consistency,
93
- consistency_rounds=(num_sequences if self_consistency else 1)
94
- )
95
- # Tokenize & generate
96
- inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
97
- start = time.time()
98
- output = cot.generate(
99
- input_ids=inputs['input_ids'],
100
- attention_mask=inputs['attention_mask'],
101
- num_return_sequences=(num_sequences if self_consistency else 1)
102
- )
103
- elapsed = time.time() - start
104
- st.success(f"✨ Done in {elapsed:.2f}s")
105
- # Display results
106
- for idx, (full, steps, ans) in enumerate(zip(output['full_texts'], output['reasoning_steps'], output['final_answers']), 1):
107
- with st.expander(f"Chain {idx}"):
108
- st.text_area("Full Text", value=full, height=200)
109
- if steps:
110
- st.write("**Steps:**")
111
- for i, s in enumerate(steps, 1): st.write(f"{i}. {s}")
112
- else:
113
- st.warning("No parsed steps.")
114
- st.markdown(f"**Final Answer:** {ans}")
115
- st.markdown("---")
116
- st.write(f"Telemetry: {get_telemetry()}")
117
 
118
- # Footer
119
- st.markdown("<sub>Built for a futuristic, seamless reasoning experience.</sub>", unsafe_allow_html=True)
1
  #!/usr/bin/env python3
2
  """
3
+ NeuroReasoner Chain-of-Thought GUI (Dark Theme Enhanced)
4
  -------------------------------------------------------------
5
+ A premium Streamlit app for step-by-step reasoning
6
+ across any Hugging Face model (causal or seq2seq).
7
+ Featuring a dark theme, model-type detection, self-consistency
8
+ sampling, and robust handling.
9
  """
10
  import os
11
  import time
12
  import streamlit as st
13
+ import torch
14
+ import pynvml # For GPU telemetry
15
+ import numpy as np
16
+ from transformers import (
17
+ AutoConfig,
18
+ AutoTokenizer,
19
+ AutoModelForCausalLM,
20
+ AutoModelForSeq2SeqLM,
21
+ GenerationConfig,
22
+ PretrainedConfig
23
+ )
24
+ from collections import Counter # For self-consistency voting
25
+ import gc # Import garbage collector
26
 
27
+ # Assuming chain_of_thought_wrapper.py is in the same directory
28
+ # and is designed to work with standard Hugging Face models and GenerationConfig.
29
+ # Make sure the wrapper correctly handles num_return_sequences for CoT and SC,
30
+ # and returns the expected dictionary structure:
31
+ # {'full_texts': [...], 'reasoning_steps': [...], 'final_answers': [...], 'consensus_answer': '...'}
32
+ try:
33
+ from chain_of_thought_wrapper import ChainOfThoughtWrapper
34
+ except ImportError:
35
+ st.error("Error: chain_of_thought_wrapper.py not found. Please ensure it's in the same directory.")
36
+ st.stop()
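# For reference, a sketch of the output shape this GUI assumes from the wrapper,
# mirroring the comment above (illustrative only, not an authoritative spec):
#   {
#       "full_texts": ["...full decoded text of chain 1...", "..."],
#       "reasoning_steps": [["Step 1 ...", "Step 2 ..."], [...]],
#       "final_answers": ["42", "42"],
#       "consensus_answer": "42",
#   }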
37
+
38
+
39
+ # --- Page Configuration ---
40
+ st.set_page_config(
41
+ page_title="🧠 NeuroReasoner CoT GUI",
42
+ page_icon="🧠",
43
+ layout="wide",
44
+ initial_sidebar_state="expanded",
45
+ menu_items={
46
+ 'Get Help': 'https://github.com/your_repo_link_here', # Replace or remove
47
+ 'Report a bug': "https://github.com/your_repo_link_here/issues", # Replace or remove
48
+ 'About': """
49
+ **NeuroReasoner Chain-of-Thought GUI**
50
+ An open-source interface powered by Hugging Face models and the NeuroReasoner wrapper.
51
+ Explore step-by-step reasoning with various language models.
52
+ """
53
+ }
54
+ )
55
+
56
+ # --- Dark Theme CSS ---
57
+ st.markdown("""
58
+ <style>
59
+ /* Overall Page Background & Text (Dark Theme) */
60
+ body {
61
+ background-color: #1E1E1E; /* Dark grey background */
62
+ color: #D4D4D4; /* Light grey text */
63
+ font-family: 'Segoe UI', Roboto, Arial, sans-serif;
64
+ }
65
+ .stApp {
66
+ background-color: #1E1E1E;
67
+ color: #D4D4D4;
68
+ }
69
+
70
+ /* Sidebar Styling */
71
+ .stSidebar {
72
+ background-color: #2D2D2D; /* Slightly lighter dark grey for sidebar */
73
+ padding: 2rem 1rem;
74
+ border-right: 1px solid #3E3E3E; /* Subtle border */
75
+ }
76
+ .stSidebar h1, .stSidebar h2, .stSidebar h3 {
77
+ color: #569CD6; /* Visual Studio Code blue for sidebar headers */
78
+ }
79
+ .stSidebar label {
80
+ color: #D4D4D4 !important; /* Ensure sidebar labels are visible */
81
+ }
82
+
83
+
84
+ /* Main Content Area */
85
+ .stContainer {
86
+ padding: 2rem;
87
+ }
88
+
89
+ /* Titles and Headers */
90
+ h1, h2, h3, h4, h5, h6 {
91
+ color: #569CD6; /* VS Code blue headings */
92
+ margin-top: 1rem;
93
+ margin-bottom: 0.8rem;
94
+ }
95
+ h1 { font-size: 2.5rem; color: #4EC9B0; } /* Teal for main title */
96
+ h2 { font-size: 2rem; border-bottom: 2px solid #569CD6; padding-bottom: 0.5rem; margin-bottom: 1rem;}
97
+
98
+
99
+ /* Buttons */
100
+ .stButton>button {
101
+ background-color: #1E4D2B; /* Dark green */
102
+ color: #4EC9B0; /* Teal text */
103
+ border: none;
104
+ border-radius: 0.5rem;
105
+ padding: 0.75rem 1.5rem;
106
+ font-size: 1rem;
107
+ font-weight: bold;
108
+ transition: background-color 0.2s ease, transform 0.1s ease;
109
+ box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.3);
110
+ }
111
+ .stButton>button:hover {
112
+ background-color: #27633A; /* Lighter green on hover */
113
+ transform: translateY(-1px);
114
+ }
115
+ .stButton>button:active {
116
+ background-color: #1A3C23; /* Darker green on click */
117
+ transform: translateY(0);
118
+ box-shadow: 1px 1px 3px rgba(0, 0, 0, 0.4);
119
+ }
120
+
121
+
122
+ /* Text areas and inputs */
123
+ .stTextArea textarea, .stTextInput input {
124
+ border: 1px solid #3E3E3E; /* Dark border */
125
+ border-radius: 0.4rem;
126
+ padding: 0.75rem;
127
+ font-size: 1rem;
128
+ background-color: #252526; /* VS Code background */
129
+ color: #D4D4D4; /* Light text */
130
+ box-shadow: inset 1px 1px 3px rgba(0, 0, 0, 0.2);
131
+ }
132
+ .stTextArea label, .stTextInput label {
133
+ font-weight: bold;
134
+ color: #9CDCFE !important; /* Light blue labels */
135
+ margin-bottom: 0.5rem;
136
+ display: block;
137
+ }
138
+ /* Streamlit status box styling */
139
+ .st-emotion-cache-vj1l9j { /* Target the status box content div */
140
+ background-color: #2D2D2D; /* Match sidebar background */
141
+ border: 1px solid #3E3E3E;
142
+ border-radius: 0.5rem;
143
+ padding: 1rem;
144
+ margin-bottom: 1rem;
145
+ }
146
+ .st-emotion-cache-vj1l9j .stMarkdown p { /* Style text inside status */
147
+ color: #D4D4D4 !important;
148
+ }
149
+ /* Status box icons/text (might need to target specific internal classes) */
150
+ .st-emotion-cache-vj1l9j .stAlert {
151
+ background-color: transparent !important; /* Don't want alert backgrounds inside status */
152
+ }
153
+
154
+
155
+ /* Info/Success/Error/Warning boxes */
156
+ .stAlert {
157
+ border-radius: 0.5rem;
158
+ margin-bottom: 1rem;
159
+ padding: 1rem;
160
+ font-size: 1rem;
161
+ border-left: 5px solid transparent; /* Base style */
162
+ }
163
+ .stAlert.stAlert-info { border-left-color: #569CD6; background-color: #2A3E52; color: #9CDCFE; } /* Dark blue info */
164
+ .stAlert.stAlert-success { border-left-color: #4EC9B0; background-color: #28403A; color: #7AC7A3; } /* Dark teal success */
165
+ .stAlert.stAlert-warning { border-left-color: #DCDCAA; background-color: #454032; color: #FFDAA6; } /* Dark yellow warning */
166
+ .stAlert.stAlert-error { border-left-color: #F44747; background-color: #4A3030; color: #F48787; } /* Dark red error */
167
+
168
+
169
+ /* Expander styling */
170
+ .streamlit-expanderHeader {
171
+ background-color: #3E3E3E; /* Dark grey header */
172
+ color: #D4D4D4; /* Light grey text */
173
+ border-radius: 0.5rem;
174
+ padding: 0.75rem 1.2rem;
175
+ margin-top: 0.8rem;
176
+ margin-bottom: 0.5rem;
177
+ font-weight: bold;
178
+ font-size: 1.1rem;
179
+ cursor: pointer;
180
+ transition: background-color 0.2s ease;
181
+ }
182
+ .streamlit-expanderHeader:hover {
183
+ background-color: #4E4E4E; /* Slightly lighter on hover */
184
+ }
185
+ .streamlit-expanderContent {
186
+ background-color: #252526; /* VS Code background */
187
+ border: 1px solid #3E3E3E;
188
+ border-top: none;
189
+ border-bottom-left-radius: 0.5rem;
190
+ border-bottom-right-radius: 0.5rem;
191
+ padding: 1.5rem;
192
+ margin-top: 0;
193
+ color: #D4D4D4;
194
+ }
195
+
196
+ /* Labels for the output text areas */
197
+ .output-label {
198
+ font-weight: bold !important;
199
+ color: #9CDCFE !important; /* Light blue */
200
+ margin-top: 1rem;
201
+ margin-bottom: 0.5rem;
202
+ display: block;
203
+ font-size: 1.1rem;
204
+ }
205
+
206
+ /* Custom class for output text areas to differentiate from input */
207
+ .output-text-area textarea {
208
+ background-color: #1E1E1E; /* Even darker background for outputs */
209
+ border: 1px solid #3E3E3E;
210
+ border-radius: 0.4rem;
211
+ padding: 0.75rem;
212
+ font-size: 1rem;
213
+ color: #D4D4D4;
214
+ }
215
+
216
+ /* Telemetry box styling */
217
+ .telemetry-box {
218
+ background-color: #2D2D2D; /* Match sidebar */
219
+ border: 1px solid #3E3E3E;
220
+ border-radius: 0.5rem;
221
+ padding: 0.75rem;
222
+ margin-top: 1rem;
223
+ font-size: 0.9rem;
224
+ color: #D4D4D4;
225
+ text-align: center;
226
+ }
227
+
228
+ /* Self-Consistency Consensus Styling */
229
+ .consensus-answer {
230
+ background-color: #28403A; /* Dark green */
231
+ color: #7AC7A3; /* Light green text */
232
+ border: 1px solid #3A5048;
233
+ border-radius: 0.5rem;
234
+ padding: 1rem;
235
+ margin-top: 1rem;
236
+ margin-bottom: 1rem;
237
+ font-size: 1.2rem;
238
+ font-weight: bold;
239
+ }
240
+ .consensus-answer strong {
241
+ color: #4EC9B0; /* Teal for "Consensus Answer" label */
242
+ }
243
+ .consensus-answer div {
244
+ color: #D4D4D4; /* Ensure the answer text is light */
245
+ }
246
+
247
+
248
+ </style>
249
+ """, unsafe_allow_html=True)
250
+
251
+
252
+ # --- GPU Telemetry Setup ---
253
  try:
254
  pynvml.nvmlInit()
255
  GPU_AVAILABLE = True
256
  except Exception:
257
  GPU_AVAILABLE = False
258
 
259
+ # Use st.empty to hold the telemetry status text, defined *outside* cached functions
260
+ telemetry_placeholder = st.empty()
261
+
262
+ def update_telemetry():
263
+ """Updates the telemetry display in the dedicated placeholder."""
264
+ telemetry_text = "[Checking System Status...]"
265
  if not GPU_AVAILABLE or not torch.cuda.is_available():
266
+ telemetry_text = "📊 System Status: [No GPU Available]"
267
+ else:
268
+ try:
269
+ h = pynvml.nvmlDeviceGetHandleByIndex(0)
270
+ u = pynvml.nvmlDeviceGetUtilizationRates(h)
271
+ m = pynvml.nvmlDeviceGetMemoryInfo(h)
272
+ mem_used_mb = m.used // 1024**2
273
+ mem_total_mb = m.total // 1024**2
274
+ telemetry_text = f"📊 System Status: GPU {u.gpu}% | Mem {mem_used_mb}/{mem_total_mb} MB"
275
+ except Exception:
276
+ telemetry_text = "📊 System Status: [Telemetry Error]"
277
+
278
+ # Use markdown with a custom class for styling
279
+ telemetry_placeholder.markdown(f'<div class="telemetry-box">{telemetry_text}</div>', unsafe_allow_html=True)
280
+
281
+
282
+ # Initial telemetry update when the script starts
283
+ update_telemetry()
284
+
285
+
286
+ # --- Caching Model Loading (Core Logic Only) ---
287
+ # Use st.cache_resource for heavy objects like models and tokenizers.
288
+ # This function MUST NOT call Streamlit elements that affect the layout
289
+ # or state outside of its own scope.
290
+ @st.cache_resource(show_spinner=False) # Spinner handled manually
291
+ def _load_model_and_tokenizer_cached(model_name: str, device: str, forced_model_type: str = None):
292
+ """
293
+ Loads the model and tokenizer. This function is cached and should
294
+ contain minimal Streamlit calls to avoid caching issues.
295
+ """
296
+ config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
297
+ is_encoder_decoder = getattr(config, "is_encoder_decoder", False)
298
+ detected_type = "Seq2Seq" if is_encoder_decoder else "Causal"
299
+
300
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
301
+ # Ensure padding token is set for generation robustness
302
+ if tokenizer.pad_token is None:
303
+ if tokenizer.eos_token is not None:
304
+ tokenizer.pad_token = tokenizer.eos_token
305
+ else:
306
+ # Fallback - adding tokens might require resizing model embeddings
307
+ # which is complex and model-dependent. This is a basic attempt.
308
+ tokenizer.add_special_tokens({'pad_token': '[PAD]'})
309
+ tokenizer.pad_token = '[PAD]' # Set the attribute
310
+ # Attempt to get the new pad token ID - may not work for all tokenizers
311
+ try:
312
+ tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids('[PAD]')
313
+ except Exception:
314
+ tokenizer.pad_token_id = None # Indicate failure to get ID
315
+
316
+ # Determine the model class based on detection or forced selection
317
+ actual_model_type = forced_model_type if forced_model_type != "Auto" else detected_type
318
+
319
+ if actual_model_type == "Seq2Seq":
320
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config, trust_remote_code=True)
321
+ elif actual_model_type == "Causal":
322
+ model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=True)
323
+ else:
324
+ raise ValueError(f"Unsupported model type selected: {actual_model_type}. Please select 'Auto', 'Causal', or 'Seq2Seq'.")
325
+
326
+ model.to(device)
327
+ model.eval() # Crucial for consistent inference behavior and disabling dropout etc.
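# Hedged addition (not part of the original commit): if a brand-new '[PAD]' token was
# added to the tokenizer above, the embedding matrix is likely one row short of the
# enlarged vocabulary; the resize mentioned in the earlier comment would look like this.
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]: model.resize_token_embeddings(len(tokenizer))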
328
+
329
+ # Ensure return_dict_in_generate is True for structured outputs
330
+ if not getattr(model.config, 'return_dict_in_generate', False):
331
+ model.config.return_dict_in_generate = True
332
+
333
+ return model, tokenizer, actual_model_type
334
+
335
+ # --- Wrapper function to handle status reporting for cached loading ---
336
+ def safe_load_model_with_status(model_name: str, device: str, forced_model_type: str = None):
337
+ """
338
+ Calls the cached loading function and handles Streamlit status updates.
339
+ """
340
+ status_text = f"🌐 Loading model '{model_name}' on device '{device}'..."
341
+ # Use st.status here, defined outside the cached function
342
+ with st.status(status_text, expanded=True) as status_box:
343
+ status_box.write("Checking system status...")
344
+ update_telemetry() # Update the separate telemetry box
345
+
346
+ try:
347
+ status_box.write("Loading configuration and tokenizer...")
348
+ # Call the actual cached loading function
349
+ model, tokenizer, actual_model_type = _load_model_and_tokenizer_cached(
350
+ model_name=model_name,
351
+ device=device,
352
+ forced_model_type=forced_model_type
353
+ )
354
+
355
+ # Report padding token status if available
356
+ if tokenizer and tokenizer.pad_token_id is None:
357
+ status_box.warning(f"Tokenizer has no pad_token_id. Generation might fail for models requiring padding (e.g., batching).")
358
+ elif tokenizer:
359
+ status_box.write(f"Tokenizer pad_token_id set to {tokenizer.pad_token_id}.")
360
+
361
+
362
+ status_box.success(f"✅ Model '{model_name}' ({actual_model_type}) loaded successfully on '{device}'.")
363
+ update_telemetry() # Final telemetry update after success
364
+ return model, tokenizer, actual_model_type
365
+
366
+ except Exception as e:
367
+ status_box.error(f"❌ Model loading failed.")
368
+ update_telemetry() # Final telemetry update after error
369
+ st.exception(e) # Display the full exception traceback
370
+ # Clean up resources in case of failure before returning None
371
+ # These are manual attempts; cache handles cleanup on its own state changes
372
+ # but explicit cleanup is good practice on error paths.
373
+ try:
374
+ if 'model' in locals() and model is not None: del model
375
+ except NameError: pass
376
+ try:
377
+ if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
378
+ except NameError: pass
379
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
380
+ gc.collect()
381
+ return None, None, None # Return None on failure
382
+
383
+
384
+ # --- Sidebar Configuration ---
385
+ with st.sidebar:
386
+ st.header("⚙️ Core Settings")
387
+ st.markdown("Configure the foundational aspects of the NeuroReasoner.")
388
+
389
+ with st.expander("🧠 Model Configuration", expanded=True):
390
+ model_name = st.text_input(
391
+ "Hugging Face Model ID or Path",
392
+ "ayjays132/NeuroReasoner-1-NR-1",
393
+ help="Enter the model ID from huggingface.co or a local path."
394
+ )
395
+
396
+ # --- Dynamic Model Type Detection ---
397
+ detected_type = "Unknown (Enter Model ID)"
398
+ # Options match the strings used in the loading function
399
+ model_type_options = ["Auto", "Causal", "Seq2Seq"]
400
+ default_model_type_index = model_type_options.index("Auto")
401
+
402
+ # Attempt to load config to detect type without caching (lightweight check)
403
+ try:
404
+ if model_name and model_name.strip(): # Only attempt if input is not empty
405
+ initial_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
406
+ is_encoder_decoder_initial = getattr(initial_config, "is_encoder_decoder", False)
407
+ detected_type = "Seq2Seq" if is_encoder_decoder_initial else "Causal"
408
+ else:
409
+ detected_type = "Unknown (Enter Model ID)"
410
+ except Exception:
411
+ detected_type = "Unknown (Config Load Error)" # Indicate config load itself failed
412
+
413
+ forced_model_type = st.selectbox(
414
+ "Architecture Type",
415
+ model_type_options,
416
+ index=default_model_type_index,
417
+ help=f"Detected: {detected_type}. 'Auto' uses the detected type. Select manually if detection is incorrect or overridden."
418
+ )
419
+
420
+ # --- Device Selection ---
421
+ available_devices = ["cpu"]
422
+ if torch.cuda.is_available():
423
+ available_devices.insert(0, "cuda") # Put cuda first if available
424
+
425
+ device = st.selectbox(
426
+ "Device",
427
+ available_devices,
428
+ help="Select the hardware device for computation (GPU recommended)."
429
+ )
430
+
431
+ st.markdown("""
432
+ <small>💡 Changing model settings requires reloading the model.</small>
433
+ """, unsafe_allow_html=True)
434
+
435
+
436
+ st.markdown("---") # Visual separator
437
+
438
+ st.header("✨ Generation Parameters")
439
+ st.markdown("Define how the AI generates reasoning steps and answers.")
440
+
441
+ with st.expander("Basic Parameters", expanded=True):
442
+ # Finalized 'Number of Reasoning Chains' parameter
443
+ num_chains = st.slider(
444
+ "Number of Reasoning Chains",
445
+ min_value=1,
446
+ max_value=15, # Kept the higher max for more robustness
447
+ value=5, # Kept the default of 5
448
+ help="How many independent reasoning chains to generate for analyzing the problem. More chains can improve Self-Consistency but take longer."
449
+ )
450
+
451
+ # Finalized 'No-repeat Ngram Size' parameter
452
+ no_repeat_ngram_size = st.slider( # Using the standard name for GenerationConfig
453
+ "No-repeat Ngram Size",
454
+ min_value=0,
455
+ max_value=10,
456
+ value=3,
457
+ help="Avoids generating repeating sequences of N tokens. Set to 0 to disable."
458
+ )
459
+
460
+ # Self-Consistency checkbox remains
461
+ self_consistency = st.checkbox(
462
+ "Enable Self-Consistency Voting",
463
+ value=True,
464
+ help="When enabled, the system generates multiple chains and identifies the most common final answer as the consensus. Requires 'Number of Reasoning Chains' > 1."
465
+ )
466
+
467
+ # Conditional warning if Self-Consistency is on but num_chains is 1
468
+ if self_consistency and num_chains <= 1:
469
+ st.warning("Self-Consistency is most effective with 2 or more chains.") # Slightly rephrased warning
470
+
471
+
472
+ # Advanced Parameters
473
+ with st.expander("🧪 Advanced Sampling Parameters"):
474
+ max_new_tokens = st.slider(
475
+ "Max Tokens per Chain",
476
+ 50, 2048, 768,
477
+ help="Maximum number of new tokens to generate for *each* individual reasoning chain. Adjust based on complexity expected."
478
+ )
479
+ temperature = st.slider(
480
+ "Temperature",
481
+ 0.0, 2.0, 0.8,
482
+ help="Controls the randomness of sampling. 0.0 is deterministic (greedy). Higher values increase diversity."
483
+ )
484
+ top_k = st.slider(
485
+ "Top-k",
486
+ 0, 100, 50,
487
+ help="Filter to consider only the top_k most likely tokens at each step (0 disables). Used with sampling."
488
+ )
489
+ top_p = st.slider(
490
+ "Top-p (Nucleus Sampling)",
491
+ 0.0, 1.0, 0.95,
492
+ help="Filter to consider tokens with cumulative probability below top_p (0.0 disables). Used with sampling."
493
+ )
494
+ do_sample = st.checkbox(
495
+ "Enable Sampling",
496
+ value=True,
497
+ help="If checked, uses probabilistic sampling (controlled by Temperature, Top-k, Top-p). If unchecked, uses greedy decoding."
498
+ )
499
+ if not do_sample:
500
+ st.info("Sampling disabled. Temperature, Top-k, and Top-p will be ignored.")
501
+
502
507
+ # Optional: Add a seed for reproducibility if desired
508
+ # generation_seed = st.number_input("Generation Seed (Optional)", value=-1, help="Set a positive integer for reproducible generation.")
509
+
510
+
511
+ st.markdown("---") # Visual separator
512
+
513
+ # Update the persistent telemetry box in the sidebar footer area
514
+ update_telemetry()
515
+
516
+
517
+ # --- Main Content Layout ---
518
+ st.title("🧠 NeuroReasoner: Chain-of-Thought Explorer")
519
+ st.markdown("Unpack complex problems with step-by-step AI reasoning.")
520
+
521
+ # Container for input and primary controls
522
+ input_container = st.container()
523
+
524
+ with input_container:
525
+ # Use columns for prompt input and action button
526
+ prompt_col, button_col = st.columns([3, 1])
527
+
528
+ with prompt_col:
529
+ prompt = st.text_area(
530
+ "📝 Enter your query or problem:",
531
+ height=150,
532
+ placeholder="Example: If a train travels at 60 mph and a car at 40 mph, starting at the same time from cities 300 miles apart, how long until they meet? Think step-by-step.",
533
+ key="user_prompt" # Added key for stability
534
+ )
535
+
536
+ with button_col:
537
+ # Add some vertical space to align the button nicely
538
+ st.markdown("<div style='height: 3.5rem;'></div>", unsafe_allow_html=True)
539
+ run_button = st.button("✨ Generate Reasoning", use_container_width=True, key="generate_button") # Added key
540
+
541
+ # Container for status updates and results
542
+ results_container = st.container()
543
+
544
+
545
+ # --- Generation Logic Trigger ---
546
+ if run_button:
547
+ if not prompt or not prompt.strip():
548
+ results_container.warning("Please enter a prompt to begin generation.")
549
+ st.stop() # Stop execution until prompt is entered
550
+
551
+ # --- Prepare for Generation ---
552
+ # Load model and tokenizer (handles caching internally with st.cache_resource
553
+ # via safe_load_model_with_status which also reports status)
554
+ # This happens only when the button is clicked and parameters might have changed
555
+ model, tokenizer, loaded_model_type = safe_load_model_with_status(model_name, device, forced_model_type)
556
+
557
+ if model is None or tokenizer is None:
558
+ # Error was already shown by safe_load_model_with_status
559
+ st.error("Model or tokenizer failed to load. Please check settings and traceback above.")
560
+ st.stop() # Stop if loading failed
561
+
562
+
563
+ # --- Configure Generation ---
564
+ # Use a status box for ongoing generation process
565
+ with results_container:
566
+ st.markdown("---") # Separator before results
567
+ generation_status = st.status("Preparing generation config...", expanded=True)
568
+ update_telemetry() # Update telemetry while status is active
569
+
570
+
571
+ try:
572
+ # Build GenerationConfig based on sidebar parameters
573
+ # num_return_sequences should match num_chains for the wrapper to process them
574
+ gen_cfg = GenerationConfig(
575
+ max_new_tokens=max_new_tokens,
576
+ temperature=temperature,
577
+ top_k=top_k,
578
+ top_p=top_p,
579
+ do_sample=do_sample,
580
+ num_return_sequences=num_chains, # one generated sequence per reasoning chain
581
+ no_repeat_ngram_size=no_repeat_ngram_size,
582
+ eos_token_id=tokenizer.eos_token_id,
583
+ pad_token_id=tokenizer.pad_token_id,
584
+ return_dict_in_generate=True,
585
+ output_scores=False,
586
+ output_attentions=False,
587
+ output_hidden_states=False,
588
+ use_cache=True,
589
+ )
590
+ generation_status.write(f"Generation parameters set: {gen_cfg.to_dict()}")
591
+ update_telemetry()
592
+
593
+ cfg = GenerationConfig(
594
+ max_new_tokens=max_new_tokens,
595
+ temperature=temperature,
596
+ top_k=top_k,
597
+ top_p=top_p,
598
+ do_sample=True,
599
+ num_return_sequences=num_chains,
600
+ no_repeat_ngram_size=no_repeat_ngram_size,
601
+ eos_token_id=tokenizer.eos_token_id,
602
+ pad_token_id=tokenizer.pad_token_id
603
+ )
604
+ except Exception as e:
605
+ generation_status.error(f"❌ Failed to create GenerationConfig: {e}")
606
+ st.exception(e)
607
  st.stop()
608
+
609
+ # --- Instantiate Wrapper ---
610
  try:
611
+ generation_status.write("Initializing Chain-of-Thought wrapper...")
612
+ # Pass the configured generation config to the wrapper
613
+ # The wrapper should internally use num_return_sequences from gen_cfg
614
+ cot_wrapper = ChainOfThoughtWrapper(
615
+ model=model,
616
+ tokenizer=tokenizer,
617
+ generation_config=gen_cfg,
618
+ device=device,
619
+ self_consistency=self_consistency,
620
+ consistency_rounds=(num_chains if self_consistency else 1)
621
+ )
622
+ generation_status.write("Wrapper initialized.")
623
+ update_telemetry()
624
+
625
  except Exception as e:
626
+ generation_status.error(f"❌ Failed to initialize CoT wrapper: {e}")
627
+ st.exception(e)
628
  st.stop()
629
 
630
+ # --- Tokenize Input ---
631
+ try:
632
+ generation_status.write("Tokenizing input prompt...")
633
+ # Use model_max_length or a reasonable cap for input length
634
+ max_input_length = tokenizer.model_max_length
635
+ if max_input_length is None or max_input_length > 4096: # Cap input length if tokenizer reports None or very large
636
+ max_input_length = 4096
637
+ if tokenizer.model_max_length is None:
638
+ generation_status.warning(f"Tokenizer has no model_max_length, capping input to {max_input_length}.")
639
+
640
+
641
+ enc = tokenizer(
642
+ prompt,
643
+ return_tensors='pt',
644
+ padding='longest', # Pad to the longest sequence in the batch (batch size is 1 here)
645
+ truncation=True,
646
+ max_length=max_input_length, # Use a proper max length for the input
647
+ ).to(device)
648
+ generation_status.write(f"Input token length: {enc['input_ids'].shape[1]}")
649
+ update_telemetry()
650
+
651
+ except Exception as e:
652
+ generation_status.error(f"❌ Tokenization failed: {e}")
653
+ st.exception(e)
654
+ st.stop()
655
+
656
+ # --- Generate ---
657
+ generation_status.update(label=f"⏳ Generating {num_chains} reasoning chains...", state="running")
658
+ start_time = time.time()
659
+
660
+ try:
661
+ # Call the wrapper's generate method
662
+ # It should handle the loop for multiple chains and self-consistency internally
663
+ outputs = cot_wrapper.generate(
664
+ input_ids=enc['input_ids'],
665
+ attention_mask=enc['attention_mask'],
666
+ # Pass any other necessary arguments to your wrapper's generate method
667
+ )
668
+ # Expected `outputs` dict structure: {'full_texts': [...], 'reasoning_steps': [...], 'final_answers': [...], 'consensus_answer': '...'}
669
+ # The wrapper should handle extracting steps/answers if needed.
670
+
671
+ except Exception as e:
672
+ generation_status.error(f"❌ Generation failed: {e}")
673
+ st.exception(e)
674
+ # Clean up resources after potential OOM or other errors
675
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
676
+ gc.collect() # Python garbage collection
677
+ st.stop()
678
+
679
+ elapsed_time = time.time() - start_time
680
+ generation_status.update(label=f"✨ Generation complete in {elapsed_time:.2f}s", state="complete")
681
+ update_telemetry() # Final telemetry update after successful generation
682
+
683
+ # --- Display Results ---
684
+ with results_container:
685
+ st.markdown("## 📚 Reasoning Output")
686
+
687
+ # Display Self-Consistency Consensus first if enabled and results are available
688
+ if self_consistency and outputs and 'consensus_answer' in outputs and outputs.get('final_answers'):
689
+ consensus = outputs.get('consensus_answer')
690
+ answers = outputs.get('final_answers', [])
691
+
692
+ st.markdown('<div class="consensus-answer">', unsafe_allow_html=True)
693
+ st.write("💡 **Consensus Answer (Self-Consistency):**")
694
+ st.write(consensus if consensus else "[Could not determine consensus]")
695
+ st.markdown('</div>', unsafe_allow_html=True)
696
+
697
+ if answers and len(answers) > 1: # Only show distribution if more than one answer was found
698
+ st.markdown("###### Answer Distribution:")
699
+ answer_counts = Counter(answers)
700
+ # Display sorted distribution
701
+ for ans, count in answer_counts.most_common():
702
+ st.write(f"- '{ans}' ({count} {'vote' if count == 1 else 'votes'})")
703
+ st.markdown("---") # Separator
704
+
705
+
706
+ # Display individual chains
707
+ full_texts = outputs.get('full_texts', [])
708
+ reasoning_steps = outputs.get('reasoning_steps', [])
709
+ final_answers = outputs.get('final_answers', [])
710
+
711
+ if not full_texts:
712
+ st.warning("No reasoning chains were generated.")
713
+ else:
714
+ st.markdown(f"### Individual Chains ({len(full_texts)} generated)")
715
+ # Iterate and display each chain in an expander
716
+ # Ensure lists are iterable, even if empty
717
+ full_texts = full_texts if isinstance(full_texts, list) else []
718
+ reasoning_steps = reasoning_steps if isinstance(reasoning_steps, list) else []
719
+ final_answers = final_answers if isinstance(final_answers, list) else []
720
+
721
+ # Pad lists to the same length in case the wrapper returned inconsistent outputs
722
+ max_len_outputs = max(len(full_texts), len(reasoning_steps), len(final_answers))
723
+ full_texts.extend(["[N/A - Generation Failed for this chain]"] * (max_len_outputs - len(full_texts)))
724
+ reasoning_steps.extend([[]] * (max_len_outputs - len(reasoning_steps)))
725
+ final_answers.extend(["[N/A]"] * (max_len_outputs - len(final_answers)))
726
+
727
+
728
+ for idx, (text, steps, ans) in enumerate(zip(full_texts, reasoning_steps, final_answers), 1):
729
+ # Use try-except just in case a single chain output is malformed
730
+ try:
731
+ # Expander for each chain, starting collapsed
732
+ with st.expander(f"Chain {idx}", expanded=False):
733
+ # Use custom class for styling the label
734
+ st.markdown('<div class="output-label">Full Generated Text:</div>', unsafe_allow_html=True)
735
+ # Use custom class for styling the text area background
736
+ st.text_area(f"chain_text_area_{idx}", text, height=250, label_visibility="collapsed", help="The complete generated output for this chain.")
737
+
738
+ if steps and isinstance(steps, list):
739
+ st.markdown('<div class="output-label">Reasoning Steps:</div>', unsafe_allow_html=True)
740
+ # Display steps as a list
741
+ if steps:
742
+ for i, step in enumerate(steps, 1):
743
+ if isinstance(step, str) and step.strip():
744
+ st.write(f"**Step {i}:** {step.strip()}")
745
+ elif not isinstance(step, str):
746
+ st.warning(f"Step {i} has invalid format.")
747
+ else:
748
+ st.info("No specific steps were extracted for this chain.")
749
+
750
+
751
+ st.markdown('<div class="output-label">Final Answer:</div>', unsafe_allow_html=True)
752
+ st.write(f"**{ans if ans else '[No answer extracted]'}**")
753
+
754
+ # Optional: Add a separator between chain sections
755
+ st.markdown("---", help="End of Chain details.")
756
+
757
+ except Exception as chain_e:
758
+ st.error(f"Error displaying Chain {idx}: {chain_e}")
759
+ st.exception(chain_e)
760
+
761
+
762
+ st.markdown("---") # Final separator
763
+ st.info("Generation process concluded. Review the chains above.")
764
+
765
+ # Clean up GPU memory after generation is complete and results are displayed
766
+ if torch.cuda.is_available():
767
+ torch.cuda.empty_cache()
768
+ gc.collect() # Python garbage collection
chain_of_thought_wrapper.py CHANGED
@@ -1,27 +1,67 @@
1
  import re
2
  import torch
3
  import logging
4
  from transformers import PreTrainedModel, AutoTokenizer, GenerationConfig, GenerationMixin
 
5
  from typing import Optional, List, Tuple, Dict, Union, Any
 
 
6
 
 
 
 
7
  logger = logging.getLogger(__name__)
8
 
9
- # Default configuration values
10
- DEFAULT_MAX_LENGTH = 1024
11
- DEFAULT_REASONING_LIMIT = 10
12
- DEFAULT_CONSISTENCY_ROUNDS = 3
13
- DEFAULT_COMPLEXITY_KEYWORDS = ["explain", "step by step", "plan", "analyze", "reasoning", "logic"]
14
- DEFAULT_FINAL_ANSWER_TAG = "Final_Answer:"
 
15
 
16
- # **Expanded** step‐pattern to catch both "Step 1:" and bare "1."
  DEFAULT_STEP_PATTERN = re.compile(
18
  r"^(?:Step\s*\d+[:.)-]|\d+[:.)-])\s*(.*)", re.IGNORECASE
19
  )
20
 
 
21
  class ChainOfThoughtWrapper:
22
  """
23
- A robust, SOTA Chain-of-Thought wrapper for Hugging Face models or custom wrappers.
24
- ALWAYS uses Chain‑of‑Thought now, with stricter injection and cleaning.
25
  """
26
 
27
  def __init__(
@@ -31,80 +71,201 @@ class ChainOfThoughtWrapper:
31
  generation_config: Optional[GenerationConfig] = None,
32
  device: Optional[str] = None,
33
  max_length: int = DEFAULT_MAX_LENGTH,
34
- reasoning_steps_limit: int = DEFAULT_REASONING_LIMIT,
35
- self_consistency: bool = False,
36
- consistency_rounds: int = DEFAULT_CONSISTENCY_ROUNDS,
37
- complexity_keywords: Optional[List[str]] = None,
38
  final_answer_tag: str = DEFAULT_FINAL_ANSWER_TAG,
 
39
  ):
40
  """
41
- model: HF model or wrapper implementing `.generate()`
42
- tokenizer: corresponding tokenizer
43
- generation_config: overrides defaults
44
- device: 'cpu'/'cuda'
45
  """
 
46
  self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
47
- self.model = model.to(self.device)
48
  self.tokenizer = tokenizer
 
 
49
  self.max_length = max_length
50
  self.reasoning_steps_limit = reasoning_steps_limit
51
- self.self_consistency = self_consistency
52
- self.consistency_rounds = max(1, consistency_rounds) if self_consistency else 1
53
- self.complexity_keywords = complexity_keywords or DEFAULT_COMPLEXITY_KEYWORDS
54
  self.final_answer_tag = final_answer_tag
 
55
  self.final_answer_pattern = re.compile(
56
  re.escape(final_answer_tag) + r"\s*(.*)", re.IGNORECASE | re.DOTALL
57
  )
 
 
58
 
59
- # Try to locate HF config; fallback to tokenizer if missing
 
60
  self._hf_model, self._hf_config = self._find_hf_model_and_config(self.model)
 
 
61
  if self._hf_config is None:
62
- logger.warning("HF config not found, falling back to tokenizer settings.")
 
63
  class PseudoConfig:
64
  def __init__(self, tok):
65
  self.eos_token_id = tok.eos_token_id
66
- self.pad_token_id = tok.pad_token_id or tok.eos_token_id
67
- self.vocab_size = len(tok)
68
  self._hf_config = PseudoConfig(self.tokenizer)
69
 
70
- # Setup generation config
 
 
71
  if generation_config:
 
72
  self.generation_config = GenerationConfig.from_dict(generation_config.to_dict())
 
73
  else:
 
74
  self.generation_config = GenerationConfig(
75
  eos_token_id=self._hf_config.eos_token_id,
76
  pad_token_id=self._hf_config.pad_token_id,
77
- max_length=self.max_length,
78
  )
 
79
 
80
- # Ensure HF model returns dict outputs
81
- try:
82
- setattr(self._hf_config, 'return_dict_in_generate', True)
83
- except Exception:
84
- pass
85
 
86
- logger.info("ChainOfThoughtWrapper ready on %s", self.device)
87
 
88
  def _find_hf_model_and_config(self, obj: Any) -> Tuple[Optional[PreTrainedModel], Optional[Any]]:
89
- """Search for underlying PreTrainedModel and its config."""
90
  if isinstance(obj, PreTrainedModel) and hasattr(obj, 'config'):
 
91
  return obj, obj.config
92
- for attr in ('model','base_model','transformer'):
93
- m = getattr(obj, attr, None)
94
- if isinstance(m, PreTrainedModel) and hasattr(m, 'config'):
95
- return m, m.config
96
- return None, getattr(obj, 'config', None)
97
 
98
  def _inject_cot(self, prompt: str) -> str:
99
- # **More prescriptive CoT template**
100
- return (
101
- f"{prompt}\n\n"
102
- "Let's analyze step by step exactly like this:\n\n"
103
- "Step 1: \n"
104
- "Step 2: \n"
105
- "Step 3: \n\n"
106
- "Final Answer:\n\n"
107
  )
 
 
 
108
 
109
  @torch.no_grad()
110
  def generate(
@@ -112,75 +273,391 @@ class ChainOfThoughtWrapper:
112
  input_ids: torch.LongTensor,
113
  attention_mask: Optional[torch.LongTensor] = None,
114
  generation_config: Optional[GenerationConfig] = None,
115
- num_return_sequences: int = 1,
116
- **kwargs
117
  ) -> Dict[str, Any]:
118
  """
119
- Returns dict with keys: sequences, full_texts, reasoning_steps, final_answers
120
- ALWAYS uses CoT path.
121
  """
122
  prompt_text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
123
 
124
- # **ALWAYS** do CoT, ignore complexity check
 
125
  cot_prompt = self._inject_cot(prompt_text)
 
126
 
127
- # Merge configs
128
- cfg = GenerationConfig.from_dict(self.generation_config.to_dict())
 
 
129
  if generation_config:
130
- cfg.update(**generation_config.to_dict())
 
 
 
131
  cfg.num_return_sequences = num_return_sequences
132
- for k,v in kwargs.items(): setattr(cfg, k, v)
133
 
134
- # Encode with injected template
135
- enc = self.tokenizer(
136
- cot_prompt, return_tensors='pt', truncation=True,
137
- max_length=self.max_length - cfg.max_new_tokens
138
- ).to(self.device)
139
 
140
- out = self.model.generate(
141
- input_ids=enc['input_ids'], attention_mask=enc['attention_mask'], generation_config=cfg
142
- )
143
 
144
- decoded = self.tokenizer.batch_decode(out, skip_special_tokens=True)
145
- results = [self._parse(text, cot_prompt) for text in decoded]
146
- seqs = out
147
- steps = [r[0] for r in results]
148
- finals = [r[1] for r in results]
149
- full = [r[2] for r in results]
150
- return {'sequences': seqs, 'full_texts': full, 'reasoning_steps': steps, 'final_answers': finals}
151
 
152
  def _parse(self, text: str, cot_prompt: str) -> Tuple[List[str], str, str]:
153
- # Remove the injected prompt
154
- body = text[len(cot_prompt):].strip() if text.startswith(cot_prompt) else text
155
 
156
- # **Clean out any stray tags or JSON fragments**
157
  body = re.sub(r"<init>.*?</init>", "", body, flags=re.DOTALL)
158
  body = re.sub(r"<final_output>.*?</final_output>", "", body, flags=re.DOTALL)
 
 
 
159
  body = re.sub(r"\{.*?\}", "", body, flags=re.DOTALL)
160
 
161
- lines = [l.strip() for l in body.splitlines() if l.strip()]
162
- steps = []
163
- final = ""
164
 
165
- for l in lines:
166
- m = DEFAULT_STEP_PATTERN.match(l)
167
- if m:
168
- steps.append(m.group(1).strip())
169
  else:
170
- fa = self.final_answer_pattern.search(l)
171
- if fa:
172
- final = fa.group(1).strip()
173
- break
174
 
175
- if not final:
176
- # assume last non‑step line is the final answer
177
- final = lines[-1] if lines else ""
178
 
179
- return steps, final, body
180
 
181
  def resize_token_embeddings(self, new_size: int):
182
- if hasattr(self._hf_model, 'resize_token_embeddings'):
183
- self._hf_model.resize_token_embeddings(new_size)
184
- logger.info("Resized embeddings to %d", new_size)
185
  else:
186
- logger.error("Cannot resize: no underlying HF model method.")
1
+ # chain_of_thought_wrapper.py
2
+
3
  import re
4
  import torch
5
  import logging
6
  from transformers import PreTrainedModel, AutoTokenizer, GenerationConfig, GenerationMixin
7
+ from transformers.utils import is_accelerate_available, is_bitsandbytes_available
8
  from typing import Optional, List, Tuple, Dict, Union, Any
9
+ import gc # Import garbage collector for cleanup
10
+ import time
11
 
12
+ # --- Logging Setup ---
13
+ # Configure logging for the module
14
+ logging.basicConfig(level=logging.INFO) # Default logging level
15
  logger = logging.getLogger(__name__)
16
+ # Prevent duplicate handlers if imported multiple times
17
+ if not logger.handlers:
18
+ handler = logging.StreamHandler()
19
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
20
+ handler.setFormatter(formatter)
21
+ logger.addHandler(handler)
22
+ logger.propagate = False # Prevent logs from going to root logger multiple times
23
+
24
 
25
+ # --- Default Configuration Values ---
26
+ # These defaults provide sensible starting points for the wrapper's behavior.
27
+ DEFAULT_MAX_LENGTH = 1024 # Default maximum length of the generated output sequence.
28
+ DEFAULT_REASONING_LIMIT = 10 # Limit on the number of steps to extract during parsing (currently unused in parse logic, but good to keep as a concept).
29
+ DEFAULT_CONSISTENCY_ROUNDS = 3 # Default number of chains to generate for self-consistency (used in __init__, passed via GUI num_chains).
30
+ DEFAULT_COMPLEXITY_KEYWORDS = ["explain", "step by step", "plan", "analyze", "reasoning", "logic"] # Keywords to potentially trigger CoT (currently unused, CoT is always on).
31
+ DEFAULT_FINAL_ANSWER_TAG = "Final_Answer:" # The specific tag expected before the final answer.
32
 
33
+ # --- Regex Pattern for Parsing Steps ---
34
+ # This pattern is used to identify and extract individual reasoning steps from
35
+ # the generated text. It's designed to be flexible, capturing:
36
+ # - "Step N:"
37
+ # - "Step N."
38
+ # - "Step N-"
39
+ # - "N:"
40
+ # - "N."
41
+ # - "N-"
42
+ # Where N is one or more digits, case-insensitive for "Step".
43
  DEFAULT_STEP_PATTERN = re.compile(
44
  r"^(?:Step\s*\d+[:.)-]|\d+[:.)-])\s*(.*)", re.IGNORECASE
45
  )
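# A quick illustration of the pattern above (a sketch, not part of the original file):
#   DEFAULT_STEP_PATTERN.match("Step 2: add the numbers").group(1)  -> "add the numbers"
#   DEFAULT_STEP_PATTERN.match("3. divide by two").group(1)         -> "divide by two"
#   DEFAULT_STEP_PATTERN.match("The answer is 5")                   -> None (not a step line)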
46
 
47
+
48
  class ChainOfThoughtWrapper:
49
  """
50
+ A robust Chain-of-Thought (CoT) wrapper for Hugging Face models.
51
+
52
+ This wrapper enforces a Chain-of-Thought process by injecting a specific
53
+ template into the prompt. It handles model generation and parses the
54
+ output to extract reasoning steps and a final answer. It is designed
55
+ to generate multiple sequences for potential Self-Consistency voting
56
+ (voting logic is expected to be handled by the calling application,
57
+ like the Streamlit GUI).
58
+
59
+ Key Features:
60
+ - Forces CoT via prompt injection.
61
+ - Parses structured reasoning steps and final answer from output.
62
+ - Supports generating multiple chains for Self-Consistency analysis.
63
+ - Compatible with Hugging Face PreTrainedModels or objects implementing `.generate()`.
64
+ - Handles device placement and merges GenerationConfig.
65
  """
66
 
67
  def __init__(
 
71
  generation_config: Optional[GenerationConfig] = None,
72
  device: Optional[str] = None,
73
  max_length: int = DEFAULT_MAX_LENGTH,
74
+ reasoning_steps_limit: int = DEFAULT_REASONING_LIMIT, # Conceptual cap on parsed steps (not yet enforced in _parse)
+ self_consistency: bool = False, # Stored as an attribute; the generated chain count is driven by num_return_sequences
+ consistency_rounds: int = DEFAULT_CONSISTENCY_ROUNDS, # Stored as an attribute
+ complexity_keywords: Optional[List[str]] = None, # Currently unused; CoT is always injected
78
  final_answer_tag: str = DEFAULT_FINAL_ANSWER_TAG,
79
+ # (A separate self_consistency_enabled flag was dropped; self-consistency is controlled by the GUI via num_return_sequences.)
80
  ):
81
  """
82
+ Initializes the ChainOfThoughtWrapper.
83
+
84
+ Args:
85
+ model (Union[PreTrainedModel, GenerationMixin, Any]): The language model.
86
+ Must have a `.generate()` method.
87
+ tokenizer (AutoTokenizer): The corresponding tokenizer.
88
+ generation_config (Optional[GenerationConfig]): A default generation configuration.
89
+ Values here can be overridden by `generate()` call.
90
+ device (Optional[str]): The device to load the model onto ('cpu' or 'cuda').
91
+ Defaults to 'cuda' if available, otherwise 'cpu'.
92
+ max_length (int): The maximum total length of the input + generated sequence.
93
+ reasoning_steps_limit (int): Conceptual limit for parsed steps (currently not enforced in _parse).
94
+ self_consistency (bool): Flag indicating if self-consistency is intended (Informs `consistency_rounds` attribute).
95
+ consistency_rounds (int): The number of chains to generate if self-consistency is active (Informs `consistency_rounds` attribute).
96
+ The actual number generated is controlled by `num_return_sequences` in `generate()` or `generation_config`.
97
+ complexity_keywords (Optional[List[str]]): List of keywords to potentially trigger CoT (currently unused).
98
+ final_answer_tag (str): The specific string marker expected before the final answer.
99
  """
100
+ # Determine and set the device
101
  self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
102
+ logger.info("Initializing wrapper on device: %s", self.device)
103
+
104
+ # Move the model to the specified device
105
+ try:
106
+ self.model = model.to(self.device)
107
+ self.model.eval() # Set model to evaluation mode for consistent behavior
108
+ logger.info("Model moved to %s and set to eval mode.", self.device)
109
+ except Exception as e:
110
+ logger.error("Failed to move model to device %s: %s", self.device, e)
111
+ raise # Re-raise the exception after logging
112
+
113
  self.tokenizer = tokenizer
114
+
115
+ # Set core parameters
116
  self.max_length = max_length
117
  self.reasoning_steps_limit = reasoning_steps_limit
118
+ self.self_consistency = self_consistency # Attribute stored, actual generation count controlled elsewhere
119
+ self.consistency_rounds = max(1, consistency_rounds) if self_consistency else 1 # Attribute stored
120
+ self.complexity_keywords = complexity_keywords or list(DEFAULT_COMPLEXITY_KEYWORDS) # Ensure it's a mutable list
121
  self.final_answer_tag = final_answer_tag
122
+ # Compile regex pattern for final answer extraction
123
  self.final_answer_pattern = re.compile(
124
  re.escape(final_answer_tag) + r"\s*(.*)", re.IGNORECASE | re.DOTALL
125
  )
126
+ logger.debug("Final answer pattern compiled: %s", self.final_answer_pattern.pattern)
127
+ logger.debug("Step pattern: %s", DEFAULT_STEP_PATTERN.pattern)
128
 
129
+ # Attempt to find the underlying Hugging Face model and its config
130
+ # This is useful for accessing standard attributes like eos_token_id, etc.
131
  self._hf_model, self._hf_config = self._find_hf_model_and_config(self.model)
132
+
133
+ # Fallback to tokenizer settings if HF config isn't found
134
  if self._hf_config is None:
135
+ logger.warning("Underlying HF model config not found. Relying on tokenizer for eos/pad tokens and vocab size.")
136
+ # Create a pseudo-config with essential tokenizer info
137
  class PseudoConfig:
138
  def __init__(self, tok):
139
  self.eos_token_id = tok.eos_token_id
140
+ # Use eos_token_id as pad_token_id if pad_token_id is None (common for GPT-like models)
141
+ self.pad_token_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id
142
+ # Fallback if both are None (less common but possible)
143
+ if self.pad_token_id is None:
144
+ logger.warning("Tokenizer pad_token_id and eos_token_id are both None. Generation might be unstable without padding.")
145
+ # Assign an arbitrary value or handle this externally if it happens in practice
146
+ # For now, keep it None, generation might fail or behave unexpectedly
147
+ pass # Keep pad_token_id as None
148
+
149
+ self.vocab_size = len(tok) # Vocabulary size from tokenizer
150
+
151
+ def __getattr__(self, name):
152
+ # Allow accessing other attributes, returning None if not found
153
+ # This prevents errors if generation_config tries to read something unexpected
154
+ logger.debug("Accessing undefined attribute '%s' on PseudoConfig. Returning None.", name)
155
+ return None
156
+
157
  self._hf_config = PseudoConfig(self.tokenizer)
158
+ logger.debug("Created PseudoConfig: eos_token_id=%s, pad_token_id=%s, vocab_size=%s",
159
+ self._hf_config.eos_token_id, self._hf_config.pad_token_id, self._hf_config.vocab_size)
160
+ else:
161
+ logger.info("Found underlying HF model config.")
162
+ logger.debug("HF Config: eos_token_id=%s, pad_token_id=%s, vocab_size=%s",
163
+ getattr(self._hf_config, 'eos_token_id', None),
164
+ getattr(self._hf_config, 'pad_token_id', None),
165
+ getattr(self._hf_config, 'vocab_size', None))
166
 
167
+
168
+ # --- Setup Generation Config ---
169
+ # Start with a base config, either provided or a default one
170
  if generation_config:
171
+ # Use from_dict and to_dict for safe merging/copying of GenerationConfig
172
  self.generation_config = GenerationConfig.from_dict(generation_config.to_dict())
173
+ logger.info("Initialized with provided GenerationConfig.")
174
  else:
175
+ # Create a default GenerationConfig using info from HF config or tokenizer fallback
176
  self.generation_config = GenerationConfig(
177
  eos_token_id=self._hf_config.eos_token_id,
178
  pad_token_id=self._hf_config.pad_token_id,
179
+ max_length=self.max_length, # Set max_length from wrapper param
180
+ # Add other common defaults if not provided
181
+ do_sample=True,
182
+ temperature=0.7,
183
+ top_p=0.95,
184
+ top_k=50,
185
+ num_return_sequences=1, # Default to 1 sequence
186
+ no_repeat_ngram_size=0, # Default to no ngram repetition prevention
187
  )
188
+ logger.info("Initialized with default GenerationConfig.")
189
 
190
+ # Ensure the underlying HF model (if found) is set to return dict outputs from generate
191
+ # This is necessary for accessing scores, hidden states etc. if needed, and for consistency.
192
+ # Use a check as some custom models might not have this attribute on their config.
193
+ if hasattr(self._hf_config, 'return_dict_in_generate'):
194
+ try:
195
+ setattr(self._hf_config, 'return_dict_in_generate', True)
196
+ logger.debug("Set _hf_config.return_dict_in_generate = True.")
197
+ except Exception as e:
198
+ logger.warning("Failed to set return_dict_in_generate on _hf_config: %s", e)
199
+ else:
200
+ logger.debug("_hf_config does not have return_dict_in_generate attribute.")
201
+
202
+
203
+ logger.info("ChainOfThoughtWrapper initialization complete on device: %s", self.device)
204
+ logger.debug("Initial GenerationConfig: %s", self.generation_config.to_dict())
205
 
 
206
 
207
  def _find_hf_model_and_config(self, obj: Any) -> Tuple[Optional[PreTrainedModel], Optional[Any]]:
208
+ """
209
+ Recursively searches for an underlying Hugging Face PreTrainedModel
210
+ and its configuration within a potentially wrapped object.
211
+
212
+ Args:
213
+ obj (Any): The object to inspect (could be the model itself or a wrapper).
214
+
215
+ Returns:
216
+ Tuple[Optional[PreTrainedModel], Optional[Any]]: The found HF model instance and its config.
217
+ Returns (None, None) if not found.
218
+ """
219
+ logger.debug("Searching for HF model in object of type: %s", type(obj))
220
+ # If the object is directly a PreTrainedModel and has a config
221
  if isinstance(obj, PreTrainedModel) and hasattr(obj, 'config'):
222
+ logger.debug("Found HF PreTrainedModel directly.")
223
  return obj, obj.config
224
+
225
+ # Check common attribute names where the base model might be stored
226
+ potential_attrs = ('model', 'base_model', 'transformer', 'hf_model')
227
+ for attr_name in potential_attrs:
228
+ m = getattr(obj, attr_name, None)
229
+ if m is not None:
230
+ logger.debug("Checking attribute '%s' of type %s", attr_name, type(m))
231
+ # Recursively search within the attribute
232
+ found_model, found_config = self._find_hf_model_and_config(m)
233
+ if found_model or found_config:
234
+ return found_model, found_config
235
+
236
+ # If no PreTrainedModel found, check if the object itself has a 'config' attribute
237
+ if hasattr(obj, 'config'):
238
+ logger.debug("Found config attribute on object, but no PreTrainedModel.")
239
+ return None, obj.config
240
+
241
+ logger.debug("No HF PreTrainedModel or config found.")
242
+ return None, None
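+ # Illustrative example of the kind of wrapping this search handles (hypothetical class, not part of this file):
+ #
+ # class MyAgent:
+ # def __init__(self, hf_model):
+ # self.model = hf_model # discovered via the 'model' attribute checked above
+ #
+ # _find_hf_model_and_config(MyAgent(gpt2_model)) would return (gpt2_model, gpt2_model.config).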
243
+
244
 
245
  def _inject_cot(self, prompt: str) -> str:
246
+ """
247
+ Injects the prescriptive Chain-of-Thought template into the user's prompt.
248
+
249
+ This method defines the expected format the model should follow for reasoning.
250
+
251
+ Args:
252
+ prompt (str): The original user prompt.
253
+
254
+ Returns:
255
+ str: The prompt with the CoT template appended.
256
+ """
257
+ # The template strongly guides the model to produce step-by-step reasoning
258
+ # followed by a specific tag for the final answer.
259
+ cot_prompt = (
260
+ f"{prompt.strip()}\n\n" # Use strip() to clean user prompt
261
+ "Let's analyze this problem logically, breaking it down step by step to reach the precise final answer.\n\n" # Enhanced instruction
262
+ "Reasoning Process:\n\n" # Clearer heading for steps
263
+ "Step 1: " # Start the first step explicitly
264
+ # More steps are not needed here, the model learns to continue the pattern
265
  )
266
+ logger.debug("Injected CoT template. Full prompt starts with: %s...", cot_prompt[:100].replace('\n', '\\n'))
267
+ return cot_prompt
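+ # For example, _inject_cot("What is 2 + 2?") would yield roughly:
+ #
+ # What is 2 + 2?
+ #
+ # Let's analyze this problem logically, breaking it down step by step to reach the precise final answer.
+ #
+ # Reasoning Process:
+ #
+ # Step 1: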
268
+
269
 
270
  @torch.no_grad()
271
  def generate(
  self,
  input_ids: torch.LongTensor,
274
  attention_mask: Optional[torch.LongTensor] = None,
275
  generation_config: Optional[GenerationConfig] = None,
276
+ num_return_sequences: int = 1, # This argument controls how many sequences are generated
277
+ **kwargs: Any # Allows passing arbitrary generation parameters
278
  ) -> Dict[str, Any]:
279
  """
280
+ Generates text using the wrapped model, enforcing Chain-of-Thought.
281
+
282
+ This method prepares the input by injecting the CoT template, calls the
283
+ underlying model's generate method, and then parses the raw outputs
284
+ to extract structured reasoning steps and final answers.
285
+
286
+ Args:
287
+ input_ids (torch.LongTensor): Tokenized input prompt (batch size 1 expected).
288
+ Shape [1, sequence_length].
289
+ attention_mask (Optional[torch.LongTensor]): Attention mask for the input.
290
+ Shape [1, sequence_length].
291
+ generation_config (Optional[GenerationConfig]): Specific generation config
292
+ for this call. Overrides defaults.
293
+ num_return_sequences (int): The number of independent sequences to generate.
294
+ This is crucial for Self-Consistency.
295
+ Comes from the GUI's 'num_chains'.
296
+ **kwargs (Any): Additional keyword arguments passed to the model's `generate` method.
297
+
298
+ Returns:
299
+ Dict[str, Any]: A dictionary containing:
300
+ - 'sequences' (torch.LongTensor): The raw generated token sequences.
301
+ - 'full_texts' (List[str]): The complete decoded text for each sequence.
302
+ - 'reasoning_steps' (List[List[str]]): List of parsed reasoning steps for each sequence.
303
+ - 'final_answers' (List[str]): List of parsed final answers for each sequence.
304
+ - 'consensus_answer' (Optional[str]): The consensus answer if self-consistency is active and possible (Handled by calling code).
305
  """
306
+ # Ensure input is on the correct device
307
+ input_ids = input_ids.to(self.device)
308
+ if attention_mask is not None:
309
+ attention_mask = attention_mask.to(self.device)
310
+
311
+ # Decode the original prompt text for CoT injection
312
+ # Assume batch size is 1 for the input prompt tensor [1, sequence_length]
313
+ if input_ids.size(0) != 1:
314
+ logger.warning("Batch size > 1 detected for input_ids (%d). CoT injection assumes batch size 1. Using the first item.", input_ids.size(0))
315
  prompt_text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
316
 
317
+ # --- Inject CoT Template ---
318
+ # This is the core step that forces the model into a reasoning mode.
319
  cot_prompt = self._inject_cot(prompt_text)
320
+ logger.debug("Injected CoT prompt. Encoding...")
321
 
322
+ # --- Prepare Generation Configuration ---
323
+ # Merge the wrapper's default config with the call-specific config and kwargs.
324
+ # The num_return_sequences from the function argument takes precedence here.
325
+ cfg = GenerationConfig.from_dict(self.generation_config.to_dict()) # Start with wrapper's default
326
  if generation_config:
327
+ cfg.update(**generation_config.to_dict()) # Update with call-specific config
328
+ logger.debug("Updated GenerationConfig with call-specific config.")
329
+
330
+ # Explicitly set num_return_sequences from the function argument
331
  cfg.num_return_sequences = num_return_sequences
332
+ logger.info("Generating %d sequence(s).", cfg.num_return_sequences)
333
 
334
+ # Update with any remaining keyword arguments passed to generate()
335
+ for k, v in kwargs.items():
336
+ if hasattr(cfg, k):
337
+ setattr(cfg, k, v)
338
+ logger.debug("Updating GenerationConfig kwarg: %s=%s", k, v)
339
+ else:
+ # Not a GenerationConfig field: leave it for model.generate(), which may still accept it;
+ # all kwargs are forwarded to model.generate() below regardless.
+ if k not in GenerationConfig().__dict__: # not a standard generation parameter
+ logger.debug("Passing non-standard kwarg '%s' through to model.generate.", k)
347
 
348
+ logger.debug("Final GenerationConfig for call: %s", cfg.to_dict())
349
+
350
+ # --- Encode the CoT Prompt ---
351
+ # Max length for input should be total max_length minus max_new_tokens
352
+ # to leave space for the generation.
353
+ # Ensure padding and truncation are handled.
354
+ try:
355
+ enc = self.tokenizer(
356
+ cot_prompt,
357
+ return_tensors='pt',
358
+ padding='longest', # Pad to the longest sequence in the batch (always 1 here)
359
+ truncation=True, # Crucially, truncate if the prompt is too long
360
+ max_length=max(1, self.max_length - (cfg.max_new_tokens or 0)) # Leave room for generation; guard against an unset max_new_tokens
361
+ ).to(self.device)
362
+ logger.debug("Encoded CoT prompt. Input shape: %s", enc['input_ids'].shape)
363
+
364
+ except Exception as e:
365
+ logger.error("Failed to encode CoT prompt: %s", e)
366
+ raise # Re-raise the exception after logging
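+ # Worked example of the token budget (illustrative numbers): with max_length=1024 and
+ # max_new_tokens=256, the prompt is truncated to at most 1024 - 256 = 768 tokens, so the
+ # prompt plus the generated continuation stays within the 1024-token window.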
367
+
368
+
369
+ # --- Generate Text ---
370
+ # Call the underlying model's generate method with the prepared input and config.
371
+ # torch.no_grad() context is already applied to the whole method.
372
+ try:
373
+ logger.info("Calling model.generate()...")
374
+ start_time = time.time() # Measure generation time
375
+ out = self.model.generate(
376
+ input_ids=enc['input_ids'],
377
+ attention_mask=enc['attention_mask'],
378
+ generation_config=cfg,
379
+ **kwargs # Pass through any extra kwargs
380
+ )
381
+ elapsed_time = time.time() - start_time
382
+ logger.info("model.generate() finished in %.2f seconds.", elapsed_time)
383
+ logger.debug("Raw output shape: %s", out.shape)
384
+
385
+
386
+ except Exception as e:
387
+ logger.error("Model generation failed: %s", e)
388
+ # Attempt to clean up GPU memory in case of OOM or other errors
389
+ if torch.cuda.is_available():
390
+ torch.cuda.empty_cache()
391
+ gc.collect() # Trigger Python garbage collection
392
+ raise # Re-raise the exception after logging
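+ # If this raises an out-of-memory error, reducing max_new_tokens or num_return_sequences
+ # (fewer chains) on the next attempt is the usual mitigation.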
393
+
394
+ # --- Decode and Parse Outputs ---
395
+ # Decode the generated token sequences back into text.
396
+ logger.debug("Decoding and parsing outputs...")
397
+ decoded_outputs = self.tokenizer.batch_decode(out, skip_special_tokens=True)
398
+
399
+ # Process each decoded output to extract steps and final answer
400
+ parsed_results = [self._parse(text, cot_prompt) for text in decoded_outputs]
401
+
402
+ # Separate the parsed components into lists
403
+ all_steps = [r[0] for r in parsed_results]
404
+ all_finals = [r[1] for r in parsed_results]
405
+ all_full_texts = [r[2] for r in parsed_results] # The 'body' after removing template
406
+
407
+ logger.info("Generated and parsed %d sequence(s).", len(decoded_outputs))
408
+
409
+ # --- Return Results ---
410
+ # The calling code (e.g., the GUI) is responsible for implementing
411
+ # Self-Consistency voting based on the list of 'final_answers' provided here.
412
+ return {
413
+ 'sequences': out, # Return raw sequences in case they are needed
414
+ 'full_texts': all_full_texts, # Text body after template removal
415
+ 'reasoning_steps': all_steps,
416
+ 'final_answers': all_finals,
417
+ # 'consensus_answer' is not computed here; the caller (e.g. the GUI) performs the voting.
+ # The key is kept so the result structure matches what the caller expects.
+ 'consensus_answer': None # Placeholder, computed externally
420
+ }
421
 
 
 
 
 
 
 
 
422
 
423
  def _parse(self, text: str, cot_prompt: str) -> Tuple[List[str], str, str]:
424
+ """
425
+ Parses the generated text to extract reasoning steps and the final answer.
426
+
427
+ Applies regex patterns to find lines matching the step format and the
428
+ final answer tag. Includes cleanup for stray model artifacts.
429
+
430
+ Args:
431
+ text (str): The raw text output from the model for a single chain.
432
+ cot_prompt (str): The exact prompt text that was injected (used to remove it from the output).
433
+
434
+ Returns:
435
+ Tuple[List[str], str, str]: A tuple containing:
436
+ - A list of extracted reasoning step strings.
437
+ - The extracted final answer string.
438
+ - The full body of the generated text (after removing the prompt).
439
+ """
440
+ logger.debug("Parsing generated text...")
441
 
442
+ # Remove the exact injected prompt from the beginning of the text.
443
+ # This isolates the model's generated continuation.
444
+ body = text
445
+ if text.startswith(cot_prompt):
446
+ body = text[len(cot_prompt):].strip()
447
+ logger.debug("Removed CoT prompt (%d characters) from beginning.", len(cot_prompt))
448
+ else:
449
+ logger.warning("Generated text does not start with the injected CoT prompt. Parsing entire text.")
450
+ body = text.strip() # Just strip whitespace if template wasn't followed
451
+
452
+ # --- Cleanup stray model artifacts ---
453
+ # Remove common problematic tags or partial JSON structures that models sometimes emit.
454
+ # This makes the raw output cleaner before step/answer extraction.
455
+ logger.debug("Cleaning stray artifacts...")
456
  body = re.sub(r"<init>.*?</init>", "", body, flags=re.DOTALL)
457
  body = re.sub(r"<final_output>.*?</final_output>", "", body, flags=re.DOTALL)
458
+ # Note: stripping every {...} block is aggressive and may remove desired output if the model
+ # legitimately emits braces; consider making this cleanup optional or more targeted if needed.
461
  body = re.sub(r"\{.*?\}", "", body, flags=re.DOTALL)
462
+ logger.debug("Artifact cleanup complete.")
463
+
464
+
465
+ lines = [l.strip() for l in body.splitlines() if l.strip()] # Split into non-empty, stripped lines
466
+ steps = [] # List to store extracted steps
467
+ final_answer = "" # Variable to store the final answer
468
+
469
+ # --- Extract Steps and Final Answer ---
470
+ # Iterate through lines and apply regex patterns.
471
+ found_final_answer_line = False
472
+ for i, line in enumerate(lines):
473
+ # Check for reasoning step pattern
474
+ step_match = DEFAULT_STEP_PATTERN.match(line)
475
+ if step_match:
476
+ # If a step is found, add the captured group (the text after the number/tag)
477
+ steps.append(step_match.group(1).strip())
478
+ logger.debug("Extracted step %d: '%s'", len(steps), steps[-1][:50])
479
+ # Stop adding steps if we've reached a defined limit (though limit isn't currently enforced after parsing)
480
+ # if len(steps) >= self.reasoning_steps_limit:
481
+ # logger.debug("Reached reasoning steps limit (%d). Stopping step extraction.", self.reasoning_steps_limit)
482
+ # # Continue iterating to potentially find the final answer after the limit
483
+ # # break # DO NOT break if we still need to find the final answer tag after the limit
484
+
485
 
486
+ else:
487
+ # If it's not a step, check for the final answer tag
488
+ final_answer_match = self.final_answer_pattern.search(line)
489
+ if final_answer_match:
490
+ # If the final answer tag is found, extract the text following it
491
+ final_answer = final_answer_match.group(1).strip()
492
+ logger.debug("Extracted final answer tagged: '%s'", final_answer[:50])
493
+ found_final_answer_line = True
494
+ # Once the final answer tag is found, we can stop processing lines for *this specific pattern*
495
+ # However, the provided code breaks the loop entirely here.
496
+ # Keeping the break to match the original logic.
497
+ break # Stop processing lines after finding the tagged answer
498
+
499
+ # --- Fallback for Final Answer ---
500
+ # If the specific final answer tag was not found, assume the last non-step line
501
+ # is the intended final answer. This is a heuristic fallback.
502
+ if not found_final_answer_line:
503
+ logger.debug("Final answer tag not found. Applying fallback heuristic.")
504
+ # Find the last line that is not a step
505
+ last_non_step_line = ""
506
+ for line in reversed(lines): # Iterate backwards
507
+ if line.strip() and not DEFAULT_STEP_PATTERN.match(line):
508
+ last_non_step_line = line.strip()
509
+ logger.debug("Fallback: Last non-step line found: '%s'", last_non_step_line[:50])
510
+ break # Found the last non-step line
511
 
512
+ if last_non_step_line:
513
+ # Check if the last non-step line *contains* the final answer tag,
514
+ # even if it didn't *start* with it or was the last line processed.
515
+ # This handles cases where the tag might be mid-line or in a different format.
516
+ fa_match_fallback = self.final_answer_pattern.search(last_non_step_line)
517
+ if fa_match_fallback:
518
+ final_answer = fa_match_fallback.group(1).strip()
519
+ logger.debug("Fallback found tagged answer in last non-step line: '%s'", final_answer[:50])
520
+ else:
521
+ # If no tag in the last non-step line, just use the line itself
522
+ final_answer = last_non_step_line
523
+ logger.debug("Fallback using last non-step line as answer: '%s'", final_answer[:50])
524
  else:
525
+ # If no non-empty lines were found, the final answer is empty
526
+ final_answer = ""
527
+ logger.debug("No lines found in body. Final answer is empty.")
528
+
529
 
530
+ logger.debug("Parsing complete. Steps found: %d, Final Answer: '%s'", len(steps), final_answer[:50])
531
+
532
+ return steps, final_answer, body # Return steps, final answer, and the cleaned body text
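+ # Illustrative parse (hypothetical model output, assuming the final-answer pattern matches "Final Answer:"):
+ #
+ # body = "Step 1: 2 + 2 means adding two and two.\nStep 2: The sum is 4.\nFinal Answer: 4"
+ # -> steps = ["2 + 2 means adding two and two.", "The sum is 4."], final_answer = "4"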
533
 
 
534
 
535
  def resize_token_embeddings(self, new_size: int):
536
+ """
537
+ Resizes the model's token embeddings, useful after adding new tokens
538
+ to the tokenizer (like a custom PAD token).
539
+
540
+ Only works if the underlying model object has a `resize_token_embeddings` method.
541
+
542
+ Args:
543
+ new_size (int): The new size of the vocabulary/embedding layer.
544
+ Should match the size of the tokenizer's vocabulary.
545
+ """
546
+ # Find the actual HF model if wrapped
547
+ hf_model_instance, _ = self._find_hf_model_and_config(self.model)
548
+
549
+ if hasattr(hf_model_instance, 'resize_token_embeddings'):
550
+ try:
551
+ old_size = hf_model_instance.get_input_embeddings().weight.size(0)
552
+ if new_size != old_size:
553
+ hf_model_instance.resize_token_embeddings(new_size)
554
+ logger.info("Resized model token embeddings from %d to %d.", old_size, new_size)
555
+ # Update model config's vocab size if available
556
+ if hasattr(hf_model_instance, 'config') and hasattr(hf_model_instance.config, 'vocab_size'):
557
+ hf_model_instance.config.vocab_size = new_size
558
+ logger.debug("Updated model config vocab_size to %d.", new_size)
559
+ else:
560
+ logger.info("Embedding size is already %d, no resizing needed.", new_size)
561
+ except Exception as e:
562
+ logger.error("Failed to resize token embeddings: %s", e)
563
+ # Attempt cleanup
564
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
565
+ gc.collect()
566
  else:
567
+ logger.error("Cannot resize token embeddings: The underlying model object does not have a 'resize_token_embeddings' method.")
568
+
569
+
570
+ # Example Usage (Illustrative - requires a real HF model and tokenizer)
571
+ if __name__ == "__main__":
572
+ print("--- ChainOfThoughtWrapper Example Usage ---")
573
+ print("This block requires a Hugging Face model to run.")
574
+ print("Loading a small dummy model for demonstration...")
575
+
576
+ # You would replace this with your actual model loading logic
577
+ try:
+ # Demo-only imports; harmless if these names are already imported at module level.
+ from collections import Counter
+ from transformers import AutoTokenizer, AutoModelForCausalLM
578
+ # Use a tiny, fast model for a quick test
579
+ model_id = "hf-internal-testing/tiny-random-gpt2"
580
+ device = "cuda" if torch.cuda.is_available() else "cpu"
581
+
582
+ logger.info(f"Attempting to load model {model_id} on {device}...")
583
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
584
+ model = AutoModelForCausalLM.from_pretrained(model_id)
585
+
586
+ # Ensure pad token is set for generation (common requirement)
587
+ if tokenizer.pad_token_id is None:
588
+ if tokenizer.eos_token_id is not None:
589
+ tokenizer.pad_token_id = tokenizer.eos_token_id
590
+ else:
591
+ # Add a pad token if neither eos nor pad exists
592
+ tokenizer.add_special_tokens({'pad_token': '[PAD]'})
593
+ model.resize_token_embeddings(len(tokenizer)) # Resize embeddings after adding token
594
+ tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids('[PAD]')
595
+ logger.warning("Added and set [PAD] token, resized embeddings.")
596
+
597
+ # Instantiate the wrapper
598
+ # Simulate parameters that would come from the GUI
599
+ simulated_gen_config = GenerationConfig(
600
+ max_new_tokens=100,
601
+ temperature=0.8,
602
+ do_sample=True,
603
+ num_return_sequences=2, # Simulate asking for 2 chains
604
+ pad_token_id=tokenizer.pad_token_id, # Pass pad_token_id explicitly
605
+ eos_token_id=tokenizer.eos_token_id, # Pass eos_token_id explicitly
606
+ )
607
+
608
+ cot_wrapper = ChainOfThoughtWrapper(
609
+ model=model,
610
+ tokenizer=tokenizer,
611
+ generation_config=simulated_gen_config,
612
+ device=device,
613
+ self_consistency=True, # Simulate SC enabled
614
+ consistency_rounds=2, # Simulate consistency rounds setting
615
+ )
616
+
617
+ # Prepare input prompt
618
+ prompt_text = "What is 2 + 2? Think step-by-step."
619
+ input_enc = tokenizer(prompt_text, return_tensors='pt').to(device)
620
+
621
+ logger.info(f"Generating reasoning for prompt: '{prompt_text}'")
622
+
623
+ # Generate outputs
624
+ # The num_return_sequences from simulated_gen_config will be used here
625
+ outputs = cot_wrapper.generate(
626
+ input_ids=input_enc['input_ids'],
627
+ attention_mask=input_enc['attention_mask']
628
+ )
629
+
630
+ # Process results (including simulated Self-Consistency voting logic)
631
+ print("\n--- Generation Results ---")
632
+ for i, (full_text, steps, final_answer) in enumerate(zip(outputs['full_texts'], outputs['reasoning_steps'], outputs['final_answers'])):
633
+ print(f"\n--- Chain {i+1} ---")
634
+ print("Full Text:")
635
+ print(full_text)
636
+ print("\nReasoning Steps:")
637
+ if steps:
638
+ for j, step in enumerate(steps):
639
+ print(f" Step {j+1}: {step}")
640
+ else:
641
+ print(" [No steps parsed]")
642
+ print("\nFinal Answer:")
643
+ print(f" {final_answer or '[No final answer parsed]'}")
644
+
645
+ # --- Simulate Self-Consistency Voting (as would be done in GUI) ---
646
+ print("\n--- Self-Consistency Voting ---")
647
+ final_answers = [ans for ans in outputs['final_answers'] if ans.strip()] # Filter empty answers
648
+ if final_answers:
649
+ answer_counts = Counter(final_answers)
650
+ most_common_answer, count = answer_counts.most_common(1)[0]
651
+ print(f"Raw Answers Submitted for Voting: {final_answers}")
652
+ print(f"Answer Counts: {dict(answer_counts)}")
653
+ print(f"Consensus Answer: '{most_common_answer}' (Voted by {count} chain(s))")
654
+ else:
655
+ print("No valid final answers found for voting.")
656
+
657
+
658
+ except Exception as e:
659
+ logger.error("Example usage failed: %s", e)
660
+ import traceback
661
+ traceback.print_exc() # Print detailed traceback for the example failure
662
+
663
+ print("\n--- Example Usage End ---")