Initial commit
- .dockerignore +15 -0
- .gitignore +3 -0
- Dockerfile +35 -0
- README.md +2 -4
- app.py +981 -0
- core/__init__.py +3 -0
- core/arxiv2md_demo.py +113 -0
- core/code_loader_demo.py +292 -0
- core/llm_demo.py +136 -0
- core/model_config.py +214 -0
- core/ollama_models.py +42 -0
- core/openrouter_models.py +208 -0
- core/prompt_demo.py +82 -0
- core/token_counter_demo.py +35 -0
- parsing.py +86 -0
- requirements.txt +9 -0
.dockerignore
ADDED
@@ -0,0 +1,15 @@
__pycache__/
data/
.env
.gitignore
.gitattributes
.git
.github
.streamlit
.vscode
.idea
.pytest_cache/
.coverage
.tox/
.venv/
.cache/
.gitignore
ADDED
@@ -0,0 +1,3 @@
__pycache__/
data/
.cache/
Dockerfile
ADDED
@@ -0,0 +1,35 @@
# Dockerfile for ScicoQA Demo - HuggingFace Spaces
FROM python:3.11-slim

# Install system dependencies
RUN apt-get update && apt-get install -y \
    git \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements first for better caching
COPY requirements.txt requirements.txt

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . /app

# Create data directories
RUN mkdir -p /app/data/papers /app/data/repos-raw

# Set environment variables for Streamlit
ENV STREAMLIT_SERVER_PORT=7860
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false

# Expose port for HuggingFace Spaces
EXPOSE 7860

# Run Streamlit app
CMD ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0", "--server.headless", "true"]
README.md
CHANGED
@@ -1,10 +1,8 @@
 ---
-title:
-emoji:
+title: SciCoQA Discrepancy Detection
+emoji: 🔬
 colorFrom: indigo
 colorTo: blue
 sdk: docker
 pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,981 @@
"""Main Streamlit app for ScicoQA Discrepancy Detection Demo."""

import logging
import os
import time
from pathlib import Path

import streamlit as st
from dotenv import load_dotenv

from core.arxiv2md_demo import Arxiv2MD
from core.code_loader_demo import CodeLoader
from core.llm_demo import LLM
from core.model_config import (
    PROVIDER_PRESETS,
    create_local_model_config,
    create_provider_model_config,
    get_api_key_env_name,
    get_provider_from_model,
)
from core.ollama_models import fetch_ollama_models
from core.openrouter_models import fetch_free_models, get_model_config
from core.prompt_demo import Prompt
from core.token_counter_demo import TokenCounter
from parsing import parse_discrepancies

# Load environment variables
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Page configuration
st.set_page_config(
    page_title="SciCoQA Paper-Code Discrepancy Detection",
    page_icon="🔬",
    layout="wide",
    initial_sidebar_state="expanded",
)


# Constants
MAX_CONTEXT_SIZE = 131072  # Default max context
MAX_TOKENS_BUFFER = 0.9  # Use 90% of max tokens


def validate_urls(arxiv_url: str, github_url: str) -> tuple[bool, str]:
    """Validate input URLs."""
    if not arxiv_url:
        return False, "Please provide an arXiv URL"
    if not github_url:
        return False, "Please provide a GitHub URL"

    if "arxiv.org" not in arxiv_url and not arxiv_url.startswith("http"):
        # Try to construct URL from ID
        if arxiv_url.replace(".", "").replace("v", "").isdigit():
            arxiv_url = f"https://arxiv.org/abs/{arxiv_url}"
        else:
            return False, "Invalid arXiv URL format"

    if "github.com" not in github_url:
        return False, "Please provide a valid GitHub URL"

    return True, ""


def validate_files(paper_file, code_file) -> tuple[bool, str]:
    """Validate uploaded files."""
    if paper_file is None:
        return False, "Please upload a paper markdown file"
    if code_file is None:
        return False, "Please upload a repository text file"

    # Check file types
    if paper_file.name and not paper_file.name.endswith(('.md', '.markdown', '.txt')):
        return False, "Paper file should be a markdown (.md) or text (.txt) file"
    if code_file.name and not code_file.name.endswith('.txt'):
        return False, "Repository file should be a text (.txt) file"

    return True, ""


def process_discrepancy_detection(
    paper_text: str | None = None,
    code_text: str | None = None,
    arxiv_url: str | None = None,
    github_url: str | None = None,
    model_config: dict | None = None,
):
    """Main processing pipeline for discrepancy detection."""
    results = {
        "paper_text": None,
        "code_prompt": None,
        "prompt": None,
        "llm_response": None,
        "discrepancies": None,
        "error": None,
        "step_timings": None,
    }

    # Use a single compact status container
    step_timings = {}  # Store timings for each step

    # Note: Uploaded files (paper_text, code_text) are only in memory and never saved
    # URL fetches (arxiv_url, github_url) use persistent cache directories for performance

    try:
        with st.status("🔄 Processing...", expanded=False) as status:
            try:
                # Step 1: Fetch/process paper
                step_start = time.time()
                if arxiv_url:
                    # Fetch from arXiv - use persistent cache directory
                    status.update(label="📄 Fetching paper from arXiv...", state="running")
                    try:
                        # Use persistent directory for caching (OK to save fetched papers)
                        arxiv2md = Arxiv2MD(output_dir=Path("data/papers"))
                        paper_text = arxiv2md(arxiv_url)
                        results["paper_text"] = paper_text
                        step_time = time.time() - step_start
                        step_timings["Paper Fetch"] = step_time
                        st.write(f"✅ Paper fetched: {step_time:.1f}s")
                        status.update(
                            label=f"✅ Paper fetched ({step_time:.1f}s)",
                            state="running",
                        )
                    except Exception as e:
                        error_msg = f"Error fetching paper: {str(e)}"
                        logger.error(error_msg)
                        results["error"] = error_msg
                        status.update(label="❌ Error fetching paper", state="error")
                        return results
                else:
                    # Use provided paper text
                    status.update(label="📄 Processing paper...", state="running")
                    try:
                        results["paper_text"] = paper_text
                        step_time = time.time() - step_start
                        step_timings["Paper Processing"] = step_time
                        st.write(f"✅ Paper processed: {step_time:.1f}s")
                        status.update(
                            label=f"✅ Paper processed ({step_time:.1f}s)",
                            state="running",
                        )
                    except Exception as e:
                        error_msg = f"Error processing paper: {str(e)}"
                        logger.error(error_msg)
                        results["error"] = error_msg
                        status.update(label="❌ Error processing paper", state="error")
                        return results

                # Step 2: Fetch/process code
                step_start = time.time()
                code_loader = None
                if github_url:
                    # Fetch from GitHub - use persistent cache directory
                    status.update(label="📦 Fetching code from GitHub...", state="running")
                    try:
                        # Use persistent directory for caching (OK to save fetched repos)
                        code_loader = CodeLoader(
                            github_url=github_url,
                            max_file_size_mb=1.0,
                            raw_repo_dir=Path("data/repos-raw"),
                        )
                        step_time = time.time() - step_start
                        step_timings["Repository Clone"] = step_time
                        st.write(f"✅ Repository cloned: {step_time:.1f}s")
                        status.update(
                            label=f"✅ Repository cloned ({step_time:.1f}s)",
                            state="running",
                        )
                    except Exception as e:
                        error_msg = f"Error cloning repository: {str(e)}"
                        logger.error(error_msg)
                        results["error"] = error_msg
                        status.update(label="❌ Error cloning repository", state="error")
                        return results
                else:
                    # Code text is already provided
                    status.update(label="📦 Processing repository...", state="running")
                    step_time = time.time() - step_start
                    step_timings["Code Processing"] = step_time
                    st.write(f"✅ Repository processed: {step_time:.1f}s")
                    status.update(
                        label=f"✅ Repository processed ({step_time:.1f}s)",
                        state="running",
                    )

                # Step 3: Calculate tokens and prepare prompt
                step_start = time.time()
                status.update(label="📝 Preparing prompt...", state="running")
                try:
                    # Use provided model config
                    tokenizer_name = model_config["tokenizer"]
                    max_context = model_config["max_context"]

                    token_counter = TokenCounter(model=tokenizer_name)

                    # Calculate tokens for paper + prompt template
                    prompt_template = Prompt("discrepancy_generation")
                    intermediate_prompt = prompt_template(paper=paper_text, code="")
                    tokens_intermediate_prompt = token_counter(intermediate_prompt)

                    # Calculate remaining tokens for code
                    max_total_tokens = int(max_context * MAX_TOKENS_BUFFER)
                    remaining_code_tokens = max_total_tokens - tokens_intermediate_prompt

                    logger.info(f"Tokens in intermediate prompt: {tokens_intermediate_prompt}")
                    logger.info(f"Remaining tokens for code: {remaining_code_tokens}")

                    # Get code prompt with token limit
                    if code_loader:
                        # Use CodeLoader for GitHub repos
                        code_prompt = code_loader.get_code_prompt(
                            token_counter=token_counter,
                            max_tokens=remaining_code_tokens,
                        )
                    else:
                        # Truncate code text to fit within token limit
                        # Simple approach: count tokens as we add content
                        code_prompt = ""
                        code_tokens = 0
                        code_lines = code_text.split('\n')

                        for line in code_lines:
                            line_with_newline = line + '\n'
                            line_tokens = token_counter(line_with_newline)
                            if code_tokens + line_tokens > remaining_code_tokens:
                                logger.warning(f"Truncating code at {code_tokens} tokens (limit: {remaining_code_tokens})")
                                break
                            code_prompt += line_with_newline
                            code_tokens += line_tokens

                    results["code_prompt"] = code_prompt

                    # Construct final prompt
                    final_prompt = prompt_template(paper=paper_text, code=code_prompt)
                    results["prompt"] = final_prompt

                    final_tokens = token_counter(final_prompt)
                    logger.info(f"Total tokens in final prompt: {final_tokens}")

                    # Calculate max_tokens for completion (respecting model's context limit)
                    # Leave some buffer for safety (use 95% of remaining context)
                    max_context = model_config["max_context"]
                    remaining_for_completion = max_context - final_tokens

                    if remaining_for_completion <= 0:
                        error_msg = f"Prompt too long: {final_tokens} tokens exceeds model's context limit of {max_context} tokens"
                        logger.error(error_msg)
                        results["error"] = error_msg
                        status.update(label="❌ Prompt too long", state="error")
                        return results

                    # Use 95% of remaining to be safe, but ensure at least some tokens
                    max_tokens_for_completion = max(1, int(remaining_for_completion * 0.95))

                    logger.info(f"Max context: {max_context}, Input tokens: {final_tokens}, Remaining: {remaining_for_completion}, Max completion tokens: {max_tokens_for_completion}")

                    step_time = time.time() - step_start
                    step_timings["Prompt Preparation"] = step_time
                    st.write(f"✅ Prompt prepared: {step_time:.1f}s ({final_tokens:,} tokens, max output: {max_tokens_for_completion:,} tokens)")
                    status.update(
                        label=f"✅ Prompt prepared ({step_time:.1f}s, {final_tokens:,} tokens)",
                        state="running",
                    )
                except Exception as e:
                    error_msg = f"Error preparing prompt: {str(e)}"
                    logger.error(error_msg)
                    results["error"] = error_msg
                    status.update(label="❌ Error preparing prompt", state="error")
                    return results

                # Step 4: Detect discrepancies with LLM
                step_start = time.time()
                status.update(label="🤖 Detecting discrepancies (this may take a while)...", state="running")
                try:
                    # Extract model configuration
                    model = model_config["model"]
                    api_key = model_config.get("api_key")
                    api_base = model_config.get("api_base")
                    max_context = model_config.get("max_context")

                    llm = LLM(
                        model=model,
                        api_key=api_key,
                        api_base=api_base,
                        temperature=1.0,
                        top_p=1.0,
                        reasoning_effort="high",
                        max_context=max_context,
                        max_tokens=max_tokens_for_completion,  # Respect model's context limit
                    )

                    response = llm(final_prompt)
                    results["llm_response"] = response

                    # Extract content from response
                    choices = response.get("choices", [])
                    if not choices:
                        raise ValueError("No choices in LLM response")

                    content = (
                        choices[0]
                        .get("message", {})
                        .get("content", "")
                    )

                    if not content:
                        raise ValueError("Empty content in LLM response")

                    # Parse discrepancies
                    discrepancies = parse_discrepancies(content)
                    results["discrepancies"] = discrepancies

                    step_time = time.time() - step_start
                    step_timings["LLM Inference"] = step_time
                    total_time = sum(step_timings.values())

                    st.write(f"✅ LLM inference: {step_time:.1f}s")
                    st.write("---")
                    st.write(f"**Total time: {total_time:.1f}s**")

                    if discrepancies:
                        count = len(discrepancies)
                        discrepancy_text = "discrepancy" if count == 1 else "discrepancies"
                        status.update(
                            label=f"✅ Complete! Found {count} {discrepancy_text} ({total_time:.1f}s total)",
                            state="complete",
                        )
                    else:
                        status.update(
                            label=f"✅ Complete! No discrepancies found ({total_time:.1f}s total)",
                            state="complete",
                        )

                except Exception as e:
                    error_msg = f"Error during LLM inference: {str(e)}"
                    logger.error(error_msg)
                    results["error"] = error_msg
                    status.update(label="❌ Error during inference", state="error")
                    return results

            except Exception as e:
                error_msg = f"Unexpected error: {str(e)}"
                logger.error(error_msg, exc_info=True)
                results["error"] = error_msg
                status.update(label="❌ Unexpected error", state="error")
                return results

        results["step_timings"] = step_timings
        return results

    except Exception as e:
        # Handle any errors that occur outside the status context
        error_msg = f"Unexpected error: {str(e)}"
        logger.error(error_msg, exc_info=True)
        results["error"] = error_msg
        return results


def main():
    """Main Streamlit app."""
    st.title("🔬 :rainbow[SciCoQA] Paper-Code Discrepancy Detection")
    st.markdown(
        """
_Detect discrepancies between scientific papers and their code implementations._
"""
    )

    # About section in main area
    with st.expander("ℹ️ About", expanded=False):
        st.markdown(
            """
This tool is a demo of our research paper on detecting discrepancies between scientific papers and their
code implementations. You can read our paper here: [arXiv:2601.XXXX](https://arxiv.org/pdf/2601.XXXX).

This tool helps researchers and developers identify inconsistencies between scientific papers and their
corresponding code implementations. Such discrepancies can lead to reproducibility issues, incorrect
implementations, or misunderstandings of the research. By using advanced LLMs to analyze both the paper
text and code, this app automatically detects mismatches in algorithms, parameters, data processing steps,
and other implementation details.

**⚠️ Important Limitations:**
Our research found that **recall is still low**, meaning the tool may miss some discrepancies.
**All outputs should be used with human verification** and should not be relied upon as the sole method
for discrepancy detection.

**LLM Provider Recommendations:**
- **Free Models (OpenRouter)**: Best for quick checks of already public paper+code combinations
- **Local Models (Ollama/vLLM)**: Best for privacy-sensitive content, e.g. for unpublished papers or code
- **Provider Models (OpenAI, Anthropic, etc.)**: Best for the highest precision and recall

**Features:**
- Support for multiple LLM providers (free, local, or premium models)
- Automatic content fetching from arXiv and GitHub
- File upload support for custom papers and repositories
- Secure API key handling (keys never stored or logged)

**Resources:**
- 📦 **Code**: [GitHub Repository](https://github.com/UKPLab/scicoqa)
- 📊 **Dataset**: [Hugging Face Dataset](https://huggingface.co/datasets/ukplab/scicoqa)
- 🌐 **Project Website**: [ukplab.github.io/scicoqa](https://ukplab.github.io/scicoqa)

**Citation:**
If you find this tool useful, please cite our paper:
```bibtex
@article{scicoqa2026,
    title = {SciCoQA: Quality Assurance for Scientific Paper-Code Alignment},
    author = {Baumgärtner, Tim and Gurevych, Iryna},
    journal = {arXiv preprint arXiv:XXXX.XXXXX},
    year = {2026},
    url = {https://github.com/UKPLab/scicoqa}
}
```
"""
        )

    # ========== SIDEBAR: Model Configuration ==========
    with st.sidebar:
        st.header("🤖 Model Configuration")

        # Determine label based on current selection
        model_config = None
        model_name = None
        display_model_name = None

        # Check if we have a model config in session state
        if "model_config" in st.session_state and st.session_state.model_config:
            existing_config = st.session_state.model_config
            display_model_name = existing_config.get("name") or existing_config.get("model", "Unknown")

        if display_model_name:
            st.caption(f"Current: {display_model_name}")

        # Model type selection
        model_type = st.radio(
            "Model Type",
            options=["Free Models (OpenRouter)", "Local Model (Ollama/vLLM)", "Provider (OpenAI, Anthropic, Gemini, etc.)"],
            help="Select free models (no API key), local models (Ollama/vLLM), or provider models (requires API key)",
            key="model_type_radio",
            index=0,  # Default to Free Models
        )
        # Store in session state for access outside sidebar
        st.session_state.model_type = model_type

        st.divider()

        # Model selection based on type
        if model_type == "Free Models (OpenRouter)":
            # Fetch free models from OpenRouter API (uses file-based cache, refreshes daily)
            if "free_models_cache" not in st.session_state:
                with st.spinner("Loading free models from OpenRouter..."):
                    free_models_raw = fetch_free_models()
                    st.session_state.free_models_cache = free_models_raw

            free_models_raw = st.session_state.free_models_cache

            if not free_models_raw:
                st.error("⚠️ Could not fetch free models from OpenRouter. Please try again later or use a different model type.")
                model_config = None
            else:
                # Show privacy warning
                st.warning(
                    "⚠️ **Privacy Notice**: Free models are provided via [OpenRouter](https://openrouter.ai). "
                    "The model provider may log your prompts and outputs. For enhanced privacy, consider using Local or Provider models with your own API keys."
                )
                # Create model options from fetched models
                model_options = {get_model_config(m)["name"]: get_model_config(m) for m in free_models_raw}

                if model_options:
                    # Find default index for the default free model (Nemotron 3 Nano 30B)
                    model_names = list(model_options.keys())
                    default_index = 0
                    for idx, name in enumerate(model_names):
                        if "nemotron 3 nano 30b" in name.lower():
                            default_index = idx
                            break

                    model_name = st.selectbox(
                        "Select Free Model",
                        options=model_names,
                        help="Free models via OpenRouter (no API key required)",
                        key="free_model_select",
                        index=default_index,
                    )
                    model_config = model_options[model_name]

                else:
                    st.error("⚠️ No free models available. Please try again later or use a different model type.")
                    model_config = None

        elif model_type == "Local Model (Ollama/vLLM)":
            st.info("🖥️ **Local Model**: Use models running locally via Ollama or vLLM (OpenAI-compatible server).")

            local_model_type = st.radio(
                "Local Server Type",
                options=["Ollama", "vLLM (OpenAI-compatible)"],
                help="Select the type of local server",
                key="local_server_type",
            )

            if local_model_type == "Ollama":
                # API Base URL comes first
                api_base = st.text_input(
                    "API Base URL",
                    value="http://localhost:11434",
                    help="Ollama API base URL",
                    key="ollama_api_base",
                )

                # Query Ollama for available models if API base is provided
                model_input = None
                if api_base and api_base.strip():
                    try:
                        with st.spinner("Fetching available models from Ollama..."):
                            available_models = fetch_ollama_models(api_base.strip())

                        if available_models:
                            model_input = st.selectbox(
                                "Select Model",
                                options=available_models,
                                help="Select a model from your Ollama server",
                                key="ollama_model_select",
                            )
                        else:
                            st.warning("⚠️ No models found or unable to connect to Ollama. You can still enter a model name manually.")
                            model_input = st.text_input(
                                "Model Name (manual entry)",
                                placeholder="e.g., llama2, mistral, codellama",
                                help="Enter the Ollama model name manually (without 'ollama/' prefix)",
                                key="ollama_model_input_manual",
                            )
                    except Exception as e:
                        logger.error(f"Error fetching Ollama models: {e}")
                        st.warning(f"⚠️ Could not fetch models from Ollama: {str(e)}. You can still enter a model name manually.")
                        model_input = st.text_input(
                            "Model Name (manual entry)",
                            placeholder="e.g., llama2, mistral, codellama",
                            help="Enter the Ollama model name manually (without 'ollama/' prefix)",
                            key="ollama_model_input_manual",
                        )
                else:
                    st.info("💡 Enter the API Base URL above to see available models, or enter a model name manually below.")
                    model_input = st.text_input(
                        "Model Name",
                        placeholder="e.g., llama2, mistral, codellama",
                        help="Enter the Ollama model name (without 'ollama/' prefix)",
                        key="ollama_model_input",
                    )

                max_context = st.number_input(
                    "Max Context (tokens)",
                    min_value=1000,
                    max_value=1000000,
                    value=131072,
                    step=1000,
                    help="Maximum context window size in tokens",
                    key="ollama_max_context",
                )

                if model_input and api_base:
                    model_name = f"ollama/{model_input}"
                    model_config = create_local_model_config(
                        model=model_name,
                        api_base=api_base.strip(),
                        max_context=max_context,
                    )
            else:  # vLLM
                model_input = st.text_input(
                    "Model Name",
                    placeholder="e.g., gpt-3.5-turbo, mistralai/Mistral-7B-Instruct-v0.1",
                    help="Enter the model name for vLLM",
                    key="vllm_model_input",
                )
                api_base = st.text_input(
                    "API Base URL",
                    value="http://localhost:8000/v1",
                    help="vLLM API base URL (OpenAI-compatible endpoint)",
                    key="vllm_api_base",
                )
                max_context = st.number_input(
                    "Max Context (tokens)",
                    min_value=1000,
                    max_value=1000000,
                    value=131072,
                    step=1000,
                    help="Maximum context window size in tokens",
                    key="vllm_max_context",
                )

                if model_input:
                    model_name = model_input
                    model_config = create_local_model_config(
                        model=model_name,
                        api_base=api_base,
                        max_context=max_context,
                    )

        else:  # Provider Model
            st.info("🔑 **Provider Model**: Use your own API keys to access premium models. Your keys are never stored, logged, or displayed.")

            provider_subtype = st.radio(
                "Model Selection",
                options=["Preset", "Custom"],
                help="Select from preset models or enter a custom model",
                key="provider_subtype",
            )

            if provider_subtype == "Preset":
                model_name = st.selectbox(
                    "Select Model",
                    options=list(PROVIDER_PRESETS.keys()),
                    help="Select a preset model (API key required)",
                    key="preset_model_select",
                )
                preset_config = PROVIDER_PRESETS[model_name]
                api_key_env = preset_config["api_key_env"]
                api_key_label = api_key_env.replace("_", " ").title()

                api_key = st.text_input(
                    f"{api_key_label}",
                    type="password",
                    help=f"Enter your {api_key_label}. Your key is never stored, logged, or displayed.",
                    placeholder="sk-..." if "OPENAI" in api_key_env else "Enter API key",
                    key="preset_api_key",
                )

                if api_key:
                    model_config = create_provider_model_config(
                        model=preset_config["model"],
                        api_key=api_key,
                        max_context=preset_config["max_context"],
                        tokenizer=preset_config["tokenizer"],
                    )
            else:  # Custom
                custom_model_name = st.text_input(
                    "Model Name (litellm format)",
                    placeholder="e.g., gpt-4o, claude-3-5-sonnet, gemini/gemini-1.5-pro",
                    help="Enter the model name in litellm format. See [litellm documentation](https://docs.litellm.ai/docs/providers) for supported formats.",
                    key="custom_model_name",
                )
                custom_max_context = st.number_input(
                    "Max Context (tokens)",
                    min_value=1000,
                    max_value=10000000,
                    value=128000,
                    step=1000,
                    help="Maximum context window size in tokens",
                    key="custom_max_context",
                )

                if custom_model_name:
                    provider = get_provider_from_model(custom_model_name)
                    api_key_env = get_api_key_env_name(provider)
                    api_key_label = api_key_env.replace("_", " ").title()

                    api_key = st.text_input(
                        f"{api_key_label}",
                        type="password",
                        help=f"Enter your {api_key_label}. Your key is never stored, logged, or displayed.",
                        placeholder="sk-..." if "OPENAI" in api_key_env else "Enter API key",
                        key="custom_api_key",
                    )

                    if api_key:
                        model_name = custom_model_name
                        model_config = create_provider_model_config(
                            model=custom_model_name,
                            api_key=api_key,
                            max_context=custom_max_context,
                        )

            st.markdown(
                "📚 **Need help with model format?** See the [litellm documentation](https://docs.litellm.ai/docs/providers) "
                "for supported providers and model naming conventions."
            )

            st.caption("🔒 Your API key is secure: never stored, logged, or displayed")

        # Show model info if model is selected
        if model_config:
            display_name = model_config.get("name") or model_config.get("model", model_name or "Unknown")
            st.caption(f"📊 Max Context: {model_config['max_context']:,} tokens")

    # ========== MAIN AREA: Input Form and Results ==========

    # Store model config in session state for next render
    if model_config:
        st.session_state.model_config = model_config
        st.session_state.model_name = model_config.get("name") or model_config.get("model", model_name or "Unknown")

    # Input form
    with st.form("discrepancy_form"):
        # Input method selection using tabs
        tab_links, tab_files = st.tabs(["arXiv and GitHub Links", "Upload Paper and Code Files"])

        # Initialize variables
        arxiv_url = None
        github_url = None
        paper_file = None
        code_file = None
        input_method = None

        with tab_links:
            col1, col2 = st.columns(2)

            with col1:
                arxiv_url = st.text_input(
                    "arXiv Paper",
                    value=st.session_state.get("example_arxiv_url", ""),
                    placeholder="https://arxiv.org/abs/2006.12834 or 2006.12834",
                    help="Enter the arXiv paper URL or just the paper ID",
                    label_visibility="visible",
                )

            with col2:
                github_url = st.text_input(
                    "GitHub Code",
                    value=st.session_state.get("example_github_url", ""),
                    placeholder="https://github.com/username/repo",
                    help="Enter the full GitHub repository URL",
                    label_visibility="visible",
                )

            if arxiv_url or github_url:
                input_method = "arXiv and GitHub Links"

        with tab_files:
            # Instructions section for file preparation
            with st.expander("📖 How to prepare files", expanded=False):
                st.markdown("""
<h3>Converting LaTeX to Markdown with Pandoc</h3>

1. Install pandoc:
```
brew install pandoc
```
For installing pandoc on Windows or Linux, see the [pandoc documentation](https://pandoc.org/installing.html).

2. Convert your LaTeX to markdown:
```bash
pandoc main.tex -f latex -t markdown -s --wrap=none -o paper.md
```

<h3>Converting Repository to Text with Gitingest</h3>

1. Install gitingest:
```bash
pip install gitingest
```

2. Generate repository text file:
```bash
gitingest https://github.com/your-username/your-repo \\
    --token YOUR_GITHUB_TOKEN \\
    -i "*.c,*.cc,*.cpp,*.cu,*.h,*.hpp,*.java,*.jl,*.m,*.matlab,Makefile,*.md,*.pl,*.ps1,*.py,*.r,*.sh,config.txt,*.rs,readme.txt,requirements_dev.txt,requirements-dev.txt,requirements.dev.txt,requirements.txt,*.scala,*.yaml,*.yml" -o repo.txt
```

**Note**: Modify the file extension list to include the files you want to include in the repository text file. For private repositories, you'll need a GitHub token. For public repositories, you can omit the `--token` parameter.
""", unsafe_allow_html=True)

            col1, col2 = st.columns(2)

            with col1:
                paper_file = st.file_uploader(
                    "Paper Markdown File",
                    type=["md", "markdown", "txt"],
                    help="Upload the paper as a markdown file",
                    label_visibility="visible",
                )

            with col2:
                code_file = st.file_uploader(
                    "Repository Text File",
                    type=["txt"],
                    help="Upload the repository as a text file (generated using gitingest)",
                    label_visibility="visible",
                )

            if paper_file or code_file:
                input_method = "Upload Paper and Code Files"

        submitted = st.form_submit_button("Detect Discrepancies", type="primary", use_container_width=True)

    # Store model info in session state
    st.session_state.model_config = model_config

    # Process form submission
    if submitted:
        # Determine input method based on which inputs are filled
        # Check if files are provided (Upload method) - prioritize files if any are uploaded
        if paper_file is not None or code_file is not None:
            is_valid, error_msg = validate_files(paper_file, code_file)
            if not is_valid:
                st.error(error_msg)
                return

            # Read file contents
            try:
                paper_text = paper_file.read().decode("utf-8") if paper_file else None
                code_text = code_file.read().decode("utf-8") if code_file else None
            except Exception as e:
                st.error(f"Error reading files: {str(e)}")
                return

            arxiv_url = None
            github_url = None
        # Otherwise check if URLs are provided (Links method)
        elif arxiv_url or github_url:
            is_valid, error_msg = validate_urls(arxiv_url, github_url)
            if not is_valid:
                st.error(error_msg)
                return

            paper_text = None
            code_text = None
        else:
            st.error("Please provide either arXiv and GitHub links, or upload paper and code files.")
            return

        # Clear example values after form submission
        if "example_arxiv_url" in st.session_state:
            del st.session_state["example_arxiv_url"]
        if "example_github_url" in st.session_state:
            del st.session_state["example_github_url"]

        # Validate model selection
        if model_config is None:
            st.error("Please select a valid model.")
            return

        # Validate API key for provider models
        model_type = st.session_state.get("model_type", "Provider (OpenAI, Anthropic, Gemini, etc.)")
        if model_type == "Provider (OpenAI, Anthropic, Gemini, etc.)":
            if "api_key" not in model_config or not model_config.get("api_key"):
                st.error("⚠️ API key required for provider models. Please enter your API key.")
                return

        # Process
        with st.spinner("Processing..."):
            results = process_discrepancy_detection(
                paper_text=paper_text,
                code_text=code_text,
                arxiv_url=arxiv_url,
                github_url=github_url,
                model_config=model_config,
            )

        # Display results
        if results["error"]:
            st.error(f"❌ Error: {results['error']}")
            return

        # Display discrepancies
        st.divider()
        st.header("Results")

        if results["discrepancies"]:
            count = len(results["discrepancies"])
            discrepancy_text = "discrepancy" if count == 1 else "discrepancies"
            st.success(f"Found {count} {discrepancy_text}")

            # Display each discrepancy in a tab
            tab_labels = [f"Discrepancy {idx}" for idx in range(1, count + 1)]
            tabs = st.tabs(tab_labels)

            for idx, (tab, discrepancy) in enumerate(zip(tabs, results["discrepancies"])):
                with tab:
                    st.markdown(discrepancy)
                    st.divider()
        else:
            st.info("✅ No discrepancies found between the paper and code.")
            st.divider()

        # Technical Details - Combined debug sections
        with st.expander("🔧 Technical Details", expanded=False):
            # Raw prompt section
            if results["prompt"]:
                st.subheader("📝 Raw Prompt")
                st.markdown("**Final prompt sent to the LLM (after truncation):**")
                model_config = st.session_state.get("model_config")
                if model_config:
                    tokenizer_name = model_config["tokenizer"]
                    token_counter = TokenCounter(model=tokenizer_name)
                    prompt_tokens = token_counter(results["prompt"])
                    st.caption(f"Prompt tokens: {prompt_tokens:,}")
                # Make prompt scrollable
                st.markdown(
                    """
<style>
.prompt-code-wrapper pre {
    max-height: 400px;
    overflow-y: auto;
}
</style>
<div class="prompt-code-wrapper">
""",
                    unsafe_allow_html=True
                )
                st.code(results["prompt"], language="text")
                st.markdown("</div>", unsafe_allow_html=True)
                st.divider()

            # Raw output section
            if results["llm_response"]:
                st.subheader("📄 Raw LLM Output")
                content = (
                    results["llm_response"]
                    .get("choices", [{}])[0]
                    .get("message", {})
                    .get("content", "")
                )
                # Show token count instead of character count
                model_config = st.session_state.get("model_config")
                if model_config:
                    tokenizer_name = model_config["tokenizer"]
                    token_counter = TokenCounter(model=tokenizer_name)
                    output_tokens = token_counter(content)
                    st.caption(f"Output tokens: {output_tokens:,}")
                st.code(content, language="yaml")
                st.divider()

            # Step timing information
            if results.get("step_timings"):
                st.subheader("⏱️ Step Timing")
                step_timings = results["step_timings"]
                total_time = sum(step_timings.values())

                # Display timing for each step
                for step_name, step_time in step_timings.items():
                    percentage = (step_time / total_time * 100) if total_time > 0 else 0
                    st.write(f"**{step_name}**: {step_time:.2f}s ({percentage:.1f}%)")

                st.metric("**Total Time**", f"{total_time:.2f}s")
                st.divider()

            # Debug info
            st.subheader("🔍 Debug Information")
            col1, col2, col3 = st.columns(3)
            with col1:
                # Get model config from session state for token counting
                model_config = st.session_state.get("model_config")
                if model_config:
                    tokenizer_name = model_config["tokenizer"]
                    token_counter = TokenCounter(model=tokenizer_name)

                    if results["paper_text"]:
                        paper_tokens = token_counter(results["paper_text"])
                        st.metric("Paper Tokens", f"{paper_tokens:,}")
                    if results["code_prompt"]:
                        code_tokens = token_counter(results["code_prompt"])
                        st.metric("Code Tokens", f"{code_tokens:,}")
            with col2:
                if results["llm_response"]:
                    usage = results["llm_response"].get("usage", {})
                    if usage:
                        input_tokens = usage.get("prompt_tokens", "N/A")
                        output_tokens = usage.get("completion_tokens", "N/A")
                        st.metric("Input Tokens", f"{input_tokens:,}" if input_tokens != "N/A" else "N/A")
                        st.metric("Output Tokens", f"{output_tokens:,}" if output_tokens != "N/A" else "N/A")
            with col3:
                if results["llm_response"]:
                    usage = results["llm_response"].get("usage", {})
                    if usage:
                        total_tokens = usage.get("total_tokens", "N/A")
                        st.metric("Total Tokens", f"{total_tokens:,}" if total_tokens != "N/A" else "N/A")
                    # Extract cost from response metadata
                    cost = results["llm_response"].get("metadata", {}).get("cost", 0.0)
                    if cost > 0:
                        st.metric("Cost", f"${cost:.4f}")
                    else:
                        st.metric("Cost", "Free")


if __name__ == "__main__":
    main()
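For readers who want to script the demo, here is a minimal sketch of the same pipeline without the Streamlit UI. It is not part of this commit and only relies on the interfaces exercised in `app.py` above (`Arxiv2MD`, `CodeLoader`, `TokenCounter`, `Prompt`, `LLM`, `parse_discrepancies`); the model id, tokenizer name, example URLs, and the 0.9/0.95 token budgets are placeholders that mirror the app's defaults, not values confirmed elsewhere in the repository.

```python
# Hedged sketch: drive the ScicoQA demo pipeline directly, assuming the core
# modules behave exactly as they are used in app.py above.
from pathlib import Path

from core.arxiv2md_demo import Arxiv2MD
from core.code_loader_demo import CodeLoader
from core.llm_demo import LLM
from core.prompt_demo import Prompt
from core.token_counter_demo import TokenCounter
from parsing import parse_discrepancies

model_config = {
    "model": "some-provider/some-model",  # hypothetical model id
    "tokenizer": "gpt-4o",                # hypothetical tokenizer name
    "max_context": 131072,
    "api_key": None,
    "api_base": None,
}

# Fetch inputs (both calls cache under data/, as in the app).
paper = Arxiv2MD(output_dir=Path("data/papers"))("https://arxiv.org/abs/2006.12834")
loader = CodeLoader(
    github_url="https://github.com/username/repo",  # example URL
    max_file_size_mb=1.0,
    raw_repo_dir=Path("data/repos-raw"),
)

counter = TokenCounter(model=model_config["tokenizer"])
template = Prompt("discrepancy_generation")

# Budget the code portion so paper + template + code fit into ~90% of the context.
budget = int(model_config["max_context"] * 0.9) - counter(template(paper=paper, code=""))
code = loader.get_code_prompt(token_counter=counter, max_tokens=budget)
prompt = template(paper=paper, code=code)

llm = LLM(
    model=model_config["model"],
    api_key=model_config["api_key"],
    api_base=model_config["api_base"],
    temperature=1.0,
    top_p=1.0,
    reasoning_effort="high",
    max_context=model_config["max_context"],
    max_tokens=max(1, int((model_config["max_context"] - counter(prompt)) * 0.95)),
)
response = llm(prompt)
content = response["choices"][0]["message"]["content"]
for discrepancy in parse_discrepancies(content):
    print(discrepancy)
```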
core/__init__.py
ADDED
@@ -0,0 +1,3 @@
# Core modules for ScicoQA demo


core/arxiv2md_demo.py
ADDED
@@ -0,0 +1,113 @@
"""Standalone arxiv2md integration for converting arXiv papers to markdown."""

import hashlib
import logging
import os
import re
from pathlib import Path
from urllib.parse import urlparse

import requests

logger = logging.getLogger(__name__)


class Arxiv2MD:
    """Convert arXiv papers to markdown using arxiv2md API."""

    API_BASE = "https://arxiv2md.org/api/markdown"
    RATE_LIMIT_RPM = 30  # 30 requests per minute per IP

    def __init__(self, output_dir: Path = Path("data") / "papers"):
        self.output_dir = output_dir
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _extract_paper_id(self, arxiv_url: str) -> str:
        """Extract paper ID from arXiv URL."""
        logger.info(f"Extracting paper ID from URL: {arxiv_url}")

        # Handle different arXiv URL formats
        if "arxiv.org" in arxiv_url:
            # Remove version suffix if present (e.g., v1, v2)
            arxiv_url = re.sub(r"v\d+$", "", arxiv_url)
            # Extract ID from URL
            parts = arxiv_url.split("/")
            paper_id = parts[-1].replace(".pdf", "").replace(".html", "")
            logger.info(f"Extracted arXiv ID: {paper_id}")
            return paper_id
        else:
            # Assume it's already an ID
            paper_id = arxiv_url.replace(".pdf", "").replace(".html", "")
            return paper_id

    def _get_paper_path(self, paper_id: str) -> Path:
        """Get the file path for a cached paper."""
        return self.output_dir / f"{paper_id}.md"

    def _load_cached_paper(self, paper_id: str) -> str | None:
        """Load cached paper if available."""
        paper_path = self._get_paper_path(paper_id)
        if paper_path.exists():
            with open(paper_path, "r", encoding="utf-8") as f:
                text = f.read()
            logger.info(f"Loaded cached paper {paper_id} from {paper_path}")
            return text
        return None

    def _save_paper(self, paper_id: str, markdown: str):
        """Save processed paper to cache."""
        paper_path = self._get_paper_path(paper_id)
        with open(paper_path, "w", encoding="utf-8") as f:
            f.write(markdown)
        logger.info(f"Saved paper {paper_id} to {paper_path}")

    def _fetch_markdown(self, arxiv_url: str) -> str:
        """Fetch markdown from arxiv2md API."""
        logger.info(f"Fetching markdown from arxiv2md API for {arxiv_url}")

        # Prepare API parameters
        params = {
            "url": arxiv_url,
            "remove_refs": "true",  # Remove references section (required)
            "remove_toc": "true",  # Remove table of contents
            "remove_citations": "true",  # Remove inline citations
        }

        try:
            response = requests.get(self.API_BASE, params=params, timeout=60)
            response.raise_for_status()
            markdown = response.text
            logger.info(f"Successfully fetched markdown ({len(markdown)} chars)")
            return markdown
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching from arxiv2md API: {e}")
            raise Exception(f"Failed to fetch paper from arxiv2md: {e}")

    def __call__(self, arxiv_url: str) -> str:
        """Process an arXiv URL and return its markdown content.

        Args:
            arxiv_url: URL to the arXiv paper (e.g., https://arxiv.org/abs/2006.12834)

        Returns:
            Markdown text of the paper with references removed
        """
        logger.debug(f"Arxiv2MD({arxiv_url})")

        # Extract paper ID
        paper_id = self._extract_paper_id(arxiv_url)

        # Check cache first
        cached_text = self._load_cached_paper(paper_id)
        if cached_text is not None:
            return cached_text

        # Fetch from API
        markdown = self._fetch_markdown(arxiv_url)

        # Save to cache
        self._save_paper(paper_id, markdown)

        return markdown
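A short usage sketch for `Arxiv2MD` (the URL is the example from its docstring; the behaviour follows the class as defined above): the first call fetches the paper from the arxiv2md API and writes it to `data/papers/<id>.md`, the second call is served from that cache.

```python
# Hedged sketch, not part of this commit: exercising the Arxiv2MD cache.
from pathlib import Path

from core.arxiv2md_demo import Arxiv2MD

arxiv2md = Arxiv2MD(output_dir=Path("data/papers"))

# First call: fetched via https://arxiv2md.org and saved as data/papers/2006.12834.md
markdown = arxiv2md("https://arxiv.org/abs/2006.12834")

# Second call: _load_cached_paper() returns the saved file, no network request
markdown_again = arxiv2md("https://arxiv.org/abs/2006.12834")
assert markdown == markdown_again
```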
core/code_loader_demo.py
ADDED
@@ -0,0 +1,292 @@
"""Standalone CodeLoader for loading and processing GitHub repositories."""

import logging
import os
import shutil
from pathlib import Path
from typing import Callable

import git
import nbconvert
import nbformat

logger = logging.getLogger(__name__)


class CodeLoader:
    """Load and process GitHub repositories for code analysis."""

    def __init__(
        self,
        github_url: str,
        max_file_size_mb: float = 1.0,
        raw_repo_dir: str | Path = "data/repos-raw",
    ):
        logger.info(
            f"Initializing CodeLoader for {github_url} with max file size "
            f"{max_file_size_mb} MB and raw repo dir {raw_repo_dir}"
        )
        self.github_url = github_url
        self.max_file_size_mb = max_file_size_mb
        self.raw_repo_dir = Path(raw_repo_dir)
        self.repo_path = self.raw_repo_dir / self.github_url_to_repo_name

        self.clone_repo()
        self.files = self._get_files()

    @property
    def github_url_to_repo_name(self):
        """Convert GitHub URL to a safe directory name."""
        base_name = (
            self.github_url.rstrip("/").split("/")[-2]
            + "__"
            + self.github_url.rstrip("/").split("/")[-1]
        )
        # Remove .git suffix if present
        if base_name.endswith(".git"):
            base_name = base_name[:-4]
        return base_name

    def clone_repo(self):
        """Clone or validate existing repository."""
        if self.repo_path.exists():
            logger.info(f"Repository already exists at {self.repo_path}")

            # Validate repository integrity
            try:
                repo = git.Repo(self.repo_path)
                # Verify repository health
                try:
                    _ = repo.head.commit.hexsha
                except (ValueError, git.BadName) as e:
                    logger.warning(
                        f"Repository has missing or corrupted commits at "
                        f"{self.repo_path}, removing and re-cloning. Error: {e}"
                    )
                    shutil.rmtree(self.repo_path)
                    self.clone_repo()  # Recursive call to re-clone
                    return

                logger.info("Repository already exists and is valid")
                return

            except (git.InvalidGitRepositoryError, git.GitCommandError) as e:
                logger.warning(
                    f"Invalid or corrupted git repository at {self.repo_path}, "
                    f"removing and re-cloning. Error: {e}"
                )
                shutil.rmtree(self.repo_path)
                self.clone_repo()  # Recursive call to re-clone
                return

        # Clone the repository
        logger.info(f"Cloning repo {self.github_url} to {self.repo_path}")
        self.raw_repo_dir.mkdir(parents=True, exist_ok=True)
        repo = git.Repo.clone_from(self.github_url, str(self.repo_path))

        # Clean up the repository
        self._cleanup_repo()

    def _cleanup_repo(self):
        """Remove docs/test directories, convert notebooks, and remove large files."""
        # Remove docs/test directories
        for root, dirs, _ in os.walk(self.repo_path):
            # CRITICAL: Skip .git directory
            if ".git" in dirs:
                dirs.remove(".git")

            # Create a copy of dirs to avoid modification during iteration
            dirs_to_remove = [
                dir
                for dir in dirs
                if dir in ["docs", "doc", "test", "tests", "example", "examples"]
            ]
            for dir in dirs_to_remove:
                dir_path = Path(root) / dir
                logger.info(f"Removing directory: {dir_path}")
                shutil.rmtree(dir_path)
                dirs.remove(dir)

        # Convert Jupyter notebooks to Python files
        for root, dirs, files in os.walk(self.repo_path):
            # Skip .git directory
            if ".git" in dirs:
                dirs.remove(".git")

            for file in files:
                if file.endswith(".ipynb"):
                    logger.info(f"Converting Jupyter Notebook {file} to .py")
                    try:
                        nb = nbformat.read(Path(root) / file, as_version=4)
                        # Clear outputs
                        for cell in nb.cells:
                            if cell.get("cell_type") == "code":
                                cell["outputs"] = []
                                cell["execution_count"] = None

                        # Convert to .py
                        exporter = nbconvert.PythonExporter()
                        source, _ = exporter.from_notebook_node(nb)
                        source = (
                            "# This file was converted from a jupyter notebook "
                            f"called {file}. All outputs have been removed.\n{source}"
                        )
                        with open(Path(root) / file.replace(".ipynb", ".py"), "w") as f:
                            f.write(source)
                        # Remove the original notebook
                        os.remove(Path(root) / file)
                    except Exception as e:
                        logger.warning(f"Failed to convert notebook {file}: {e}")
                        raise e

        # Remove large files
        for root, dirs, files in os.walk(self.repo_path):
            # Skip .git directory
            if ".git" in dirs:
                dirs.remove(".git")

            for file in files:
                file_path = Path(root) / file
                try:
                    file_size = file_path.stat().st_size
                except FileNotFoundError as e:
                    logger.warning(f"Failed to get size of {file_path}: {e}")
                    continue
                if file_size > self.mb_to_bytes(self.max_file_size_mb):
                    logger.info(f"Removing large file: {file_path}")
                    os.remove(file_path)

    def _get_files(self):
        """Get all files from the repository."""
        files = {}
        for root, _, _files in os.walk(self.repo_path):
            for file in _files:
                file_path = Path(root) / file
                if ".git" in str(file_path):
                    continue

                # Get relative path from repo root
                file_path_key = file_path.relative_to(self.repo_path)

                try:
                    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                        content = f.read()
                    files[str(file_path_key)] = content
                except Exception as e:
                    logger.warning(f"Could not read {file_path}: {e}")

        # Order keys alphabetically
        files = dict(sorted(files.items()))
        return files

    @staticmethod
    def mb_to_bytes(mb: float) -> int:
        """Convert megabytes to bytes."""
        return int(mb * 1024 * 1024)

    def get_files_by_extension(
        self, extensions: list[str] | None = None
    ) -> dict[str, str]:
        """Get files filtered by extension."""
        if extensions is None:
            # Note: ipynb files are converted to .py during cleanup
            extensions = [
                ".c",
                ".cc",
                ".cpp",
                ".cu",
                ".h",
                ".hpp",
                ".java",
                ".jl",
                ".m",
                ".matlab",
                ".Makefile",
                ".md",
                ".pl",
                ".ps1",
                ".py",
                ".r",
                ".sh",
                "config.txt",
                ".rs",
                "readme.txt",
                "requirements_dev.txt",
                "requirements-dev.txt",
                "requirements.dev.txt",
                "requirements.txt",
                ".scala",
                ".yaml",
                ".yml",
            ]
        return {
            k: v
            for k, v in self.files.items()
            if k.lower().endswith(tuple(extensions))
        }

    def get_repo_tree(self):
        """Generate a tree representation of the repository."""
        repo_tree = ""
        for root, dirs, files in os.walk(self.repo_path):
            # Exclude the .git directory
            if ".git" in dirs:
                dirs.remove(".git")

            level = str(Path(root).relative_to(self.repo_path)).count(os.sep)
            indent = "│ " * (level - 1) + "├── " if level > 0 else ""

            # Don't print the starting path itself, just its contents
            if level > 0:
                repo_tree += f"{indent}{Path(root).name}/\n"

            sub_indent = "│ " * level + "├── "
            for f in files:
                repo_tree += f"{sub_indent}{f}\n"
        return repo_tree

    def get_code_prompt(
        self,
        file_extensions: list[str] | None = None,
        token_counter: Callable | None = None,
        max_tokens: int | None = None,
        code_changes: list[dict[str, str]] | None = None,
    ) -> str:
        """Generate code prompt with repo tree and file contents."""
        code_prompt = "Repo tree:\n" + self.get_repo_tree() + "\n\n"
        tokens = token_counter(code_prompt) if token_counter is not None else 0

        files_to_replace = {}
        if code_changes:
            files_to_replace = {
                cc["file_name"]: cc["discrepancy_code"] for cc in code_changes
            }
            logger.debug(
                f"Files to replace: {len(files_to_replace)}: {files_to_replace.keys()}"
            )

        for file_path, file_content in self.get_files_by_extension(
            file_extensions
        ).items():
            if file_path in files_to_replace:
                logger.debug(f"Replacing code for {file_path} with changed code")
                file_content = files_to_replace[file_path]
            code_file = f"# ---\n# File: {file_path}\n# Content:\n{file_content}\n"
            if token_counter is not None:
                logger.debug(f"Adding file: {file_path}")
                num_tokens = token_counter(code_file)
                tokens += num_tokens
                logger.debug(
                    f"Number of tokens in file: {num_tokens}. "
                    f"Total number of tokens in code prompt: {tokens}"
                )
                if max_tokens and tokens > max_tokens:
                    logger.warning(
                        f"Truncating. Max tokens reached for {self.github_url}. "
                        f"Max tokens for code is {max_tokens}"
                    )
                    break
            code_prompt += code_file
        return code_prompt
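A short sketch of how `CodeLoader` is meant to be driven together with a token counter. The repository URL and the token budget here are arbitrary examples, not values used by the app:

```python
from core.code_loader_demo import CodeLoader
from core.token_counter_demo import TokenCounter

# Clone (or reuse) the repo under data/repos-raw, then build a token-budgeted code prompt.
loader = CodeLoader("https://github.com/huggingface/peft", max_file_size_mb=1.0)
counter = TokenCounter("gpt-4o")
code_prompt = loader.get_code_prompt(token_counter=counter, max_tokens=100_000)
print(loader.get_repo_tree())
print(code_prompt[:500])
```

Truncation is file by file: once the running token count exceeds `max_tokens`, the loop stops and the remaining files are dropped from the prompt.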
core/llm_demo.py
ADDED
@@ -0,0 +1,136 @@
"""Standalone LLM client using litellm for multiple providers."""

import logging
import os

from litellm import completion, completion_cost

logger = logging.getLogger(__name__)


class LLM:
    """LLM client supporting multiple providers via litellm unified interface."""

    def __init__(
        self,
        model: str,
        api_key: str | None = None,
        api_base: str | None = None,
        temperature: float = 1.0,
        top_p: float = 1.0,
        reasoning_effort: str = "high",
        max_tokens: int | None = None,
        max_context: int | None = None,
    ):
        """
        Initialize LLM client.

        Args:
            model: Model identifier in litellm format (e.g., "gpt-4o", "claude-3-5-sonnet", "openrouter/nvidia/nemotron-3-nano-30b-a3b:free", "ollama/llama2")
            api_key: API key (optional, can also be set via environment variable)
            api_base: API base URL (for local models like Ollama/vLLM)
            temperature: Sampling temperature
            top_p: Top-p sampling parameter
            reasoning_effort: Reasoning effort level ("high" for models that support it)
            max_tokens: Maximum tokens to generate
            max_context: Maximum context window size (required for Ollama models as num_ctx)
        """
        self.model = model
        self.api_key = api_key
        self.api_base = api_base
        self.temperature = temperature
        self.top_p = top_p
        self.max_tokens = max_tokens
        self.max_context = max_context

        # Convert reasoning_effort to extra_body format
        if reasoning_effort == "high":
            self.extra_body = {"think": "high"}
        else:
            self.extra_body = {}

        # Never log API keys - only log masked version
        masked_key = f"{api_key[:8]}..." if api_key and len(api_key) > 8 else "None"
        logger.info(f"Initialized LLM client for {model} (key: {masked_key}, api_base: {api_base})")

    def __call__(self, prompt: str) -> dict:
        """
        Generate completion from prompt.

        Args:
            prompt: Input prompt text

        Returns:
            Response dictionary with 'choices' containing the generated text and 'cost' in metadata
        """
        # Never log the prompt if it might contain sensitive info
        logger.debug(f"Calling LLM with prompt length: {len(prompt)} chars")

        try:
            # Build base kwargs - litellm handles provider detection automatically
            kwargs = {
                "model": self.model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": self.temperature,
                "top_p": self.top_p,
                "max_tokens": self.max_tokens,
            }

            # Set API key if provided
            if self.api_key:
                kwargs["api_key"] = self.api_key
            # Set API base for local models
            if self.api_base:
                kwargs["api_base"] = self.api_base

            # For Ollama models, set num_ctx (max context tokens)
            if self.model.startswith("ollama/") and self.max_context:
                kwargs["num_ctx"] = self.max_context
                logger.debug(f"Using {self.max_context} tokens (num_ctx) for Ollama model {self.model}")

            # Add extra_body for reasoning effort if specified
            if self.extra_body:
                kwargs["extra_body"] = self.extra_body

            response = completion(**kwargs)

            # Convert to dict format
            if hasattr(response, "model_dump"):
                result = response.model_dump()
            else:
                # Fallback for older litellm versions
                result = {
                    "choices": [
                        {
                            "message": {
                                "content": response.choices[0].message.content
                            }
                        }
                    ],
                    "usage": response.usage.model_dump() if hasattr(response.usage, "model_dump") else {},
                }

            # Calculate cost using litellm
            try:
                cost = completion_cost(response)
            except Exception as e:
                logger.warning(f"Error calculating cost: {e}")
                cost = 0.0

            # Add cost to result metadata
            if "metadata" not in result:
                result["metadata"] = {}
            result["metadata"]["cost"] = cost

            logger.info(f"LLM call completed successfully (cost: ${cost:.4f})")
            return result

        except Exception as e:
            # Never log API keys in error messages
            error_msg = str(e)
            # Remove any potential API key leaks from error messages
            if self.api_key and self.api_key in error_msg:
                error_msg = error_msg.replace(self.api_key, "***REDACTED***")
            logger.error(f"Error calling LLM: {error_msg}")
            raise Exception(f"LLM API error: {error_msg}") from e
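A hedged usage sketch of the `LLM` wrapper. The model id is one of the free OpenRouter models mentioned in the docstring, the prompt is illustrative, and the API key is read from the environment:

```python
import os

from core.llm_demo import LLM

llm = LLM(
    model="openrouter/nvidia/nemotron-3-nano-30b-a3b:free",
    api_key=os.environ.get("OPENROUTER_API_KEY"),
)
result = llm("Summarize the difference between a paper and its code in one sentence.")
print(result["choices"][0]["message"]["content"])
print("cost:", result["metadata"]["cost"])  # 0.0 for free models or when the cost lookup fails
```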
core/model_config.py
ADDED
@@ -0,0 +1,214 @@
"""Model configuration helpers and preset models."""

from typing import Any

# Preset provider models for quick selection
PROVIDER_PRESETS = {
    "GPT-5": {
        "model": "gpt-5-2025-08-07",
        "tokenizer": "openai/gpt-5-2025-08-07",
        "max_context": 272000,
        "api_key_env": "OPENAI_API_KEY",
    },
    "GPT-5 Mini": {
        "model": "gpt-5-mini-2025-08-07",
        "tokenizer": "openai/gpt-5-mini-2025-08-07",
        "max_context": 272000,
        "api_key_env": "OPENAI_API_KEY",
    },
    "GPT-5 Nano": {
        "model": "gpt-5-nano-2025-08-07",
        "tokenizer": "openai/gpt-5-nano-2025-08-07",
        "max_context": 272000,
        "api_key_env": "OPENAI_API_KEY",
    },
    "GPT-4o": {
        "model": "gpt-4o",
        "tokenizer": "openai/gpt-4o",
        "max_context": 128000,
        "api_key_env": "OPENAI_API_KEY",
    },
    "GPT-4 Turbo": {
        "model": "gpt-4-turbo",
        "tokenizer": "openai/gpt-4-turbo",
        "max_context": 128000,
        "api_key_env": "OPENAI_API_KEY",
    },
    "Claude 3.5 Sonnet": {
        "model": "claude-3-5-sonnet-20241022",
        "tokenizer": "anthropic/claude-3-5-sonnet",
        "max_context": 200000,
        "api_key_env": "ANTHROPIC_API_KEY",
    },
    "Claude 3 Opus": {
        "model": "claude-3-opus-20240229",
        "tokenizer": "anthropic/claude-3-opus",
        "max_context": 200000,
        "api_key_env": "ANTHROPIC_API_KEY",
    },
    "Gemini 3.0 Pro": {
        "model": "gemini/gemini-3-pro-preview",
        "tokenizer": "gemini/gemini-3-pro-preview",
        "max_context": 2000000,
        "api_key_env": "GEMINI_API_KEY",
    },
    "Gemini 3.0 Flash": {
        "model": "gemini/gemini-3-flash-preview",
        "tokenizer": "gemini/gemini-3-flash-preview",
        "max_context": 1000000,
        "api_key_env": "GEMINI_API_KEY",
    },
    "Gemini 2.5 Pro": {
        "model": "gemini/gemini-2.5-pro",
        "tokenizer": "gemini/gemini-2.5-pro",
        "max_context": 2000000,
        "api_key_env": "GEMINI_API_KEY",
    },
    "Gemini 2.5 Flash": {
        "model": "gemini/gemini-2.5-flash",
        "tokenizer": "gemini/gemini-2.5-flash",
        "max_context": 1000000,
        "api_key_env": "GEMINI_API_KEY",
    },
}


def create_local_model_config(
    model: str,
    api_base: str | None = None,
    max_context: int = 131072,
    tokenizer: str | None = None,
) -> dict[str, Any]:
    """
    Create a local model configuration.

    Args:
        model: Model name (e.g., "ollama/llama2" or "gpt-3.5-turbo" for vLLM)
        api_base: API base URL (defaults based on model type)
        max_context: Maximum context window size
        tokenizer: Tokenizer name for token counting

    Returns:
        Model configuration dictionary
    """
    # Set default API base based on model type
    if api_base is None:
        if model.startswith("ollama/"):
            api_base = "http://localhost:11434"
        elif model.startswith("vllm/") or not model.startswith(("ollama/", "openrouter/")):
            # Assume OpenAI-compatible (vLLM)
            api_base = "http://localhost:8000/v1"

    # Infer tokenizer if not provided
    if tokenizer is None:
        if model.startswith("ollama/"):
            # Try to infer from model name
            model_name = model.replace("ollama/", "")
            tokenizer = f"hf/{model_name}"
        else:
            # For vLLM/OpenAI-compatible, try to infer
            tokenizer = model.replace("vllm/", "")

    return {
        "type": "local",
        "model": model,
        "api_base": api_base,
        "max_context": max_context,
        "tokenizer": tokenizer,
    }


def create_provider_model_config(
    model: str,
    api_key: str,
    max_context: int = 128000,
    tokenizer: str | None = None,
) -> dict[str, Any]:
    """
    Create a provider model configuration.

    Args:
        model: Model name in litellm format
        api_key: API key for the provider
        max_context: Maximum context window size
        tokenizer: Tokenizer name for token counting

    Returns:
        Model configuration dictionary
    """
    # Infer tokenizer if not provided
    if tokenizer is None:
        # Try to infer from model name
        if model.startswith("openai/") or "/" not in model:
            # OpenAI models
            model_name = model.replace("openai/", "")
            tokenizer = f"openai/{model_name}"
        elif model.startswith("anthropic/") or model.startswith("claude-"):
            # Anthropic models
            model_name = model.replace("anthropic/", "")
            tokenizer = f"anthropic/{model_name}"
        elif model.startswith("gemini/"):
            # Gemini models
            model_name = model.replace("gemini/", "")
            tokenizer = f"gemini/{model_name}"
        else:
            # Generic fallback
            tokenizer = "gpt2"

    return {
        "type": "provider",
        "model": model,
        "api_key": api_key,
        "max_context": max_context,
        "tokenizer": tokenizer,
    }


def get_provider_from_model(model: str) -> str:
    """
    Infer provider name from model identifier.

    Args:
        model: Model name in litellm format

    Returns:
        Provider name hint (e.g., "openai", "anthropic", "gemini")
    """
    model_lower = model.lower()
    if model_lower.startswith("openai/") or "/" not in model:
        return "openai"
    elif model_lower.startswith("anthropic/") or model_lower.startswith("claude-"):
        return "anthropic"
    elif model_lower.startswith("gemini/"):
        return "gemini"
    elif model_lower.startswith("openrouter/"):
        return "openrouter"
    elif model_lower.startswith("cohere/"):
        return "cohere"
    elif model_lower.startswith("mistral/"):
        return "mistral"
    else:
        return "other"


def get_api_key_env_name(provider: str) -> str:
    """
    Get the environment variable name for API key based on provider.

    Args:
        provider: Provider name

    Returns:
        Environment variable name for API key
    """
    provider_to_key = {
        "openai": "OPENAI_API_KEY",
        "anthropic": "ANTHROPIC_API_KEY",
        "gemini": "GEMINI_API_KEY",
        "openrouter": "OPENROUTER_API_KEY",
        "cohere": "COHERE_API_KEY",
        "mistral": "MISTRAL_API_KEY",
        "other": "API_KEY",
    }
    return provider_to_key.get(provider.lower(), "API_KEY")
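A small sketch of how these helpers compose; `ollama/llama3.1` is a stand-in local model name, not a value required by the app:

```python
from core.model_config import (
    PROVIDER_PRESETS,
    create_local_model_config,
    get_api_key_env_name,
    get_provider_from_model,
)

local_cfg = create_local_model_config("ollama/llama3.1")
# -> {"type": "local", "api_base": "http://localhost:11434", "tokenizer": "hf/llama3.1", ...}

preset = PROVIDER_PRESETS["GPT-4o"]
env_var = get_api_key_env_name(get_provider_from_model(preset["model"]))
print(local_cfg["api_base"], preset["max_context"], env_var)  # http://localhost:11434 128000 OPENAI_API_KEY
```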
core/ollama_models.py
ADDED
@@ -0,0 +1,42 @@
"""Helper functions to query Ollama API for available models."""

import logging
from typing import Any

import requests

logger = logging.getLogger(__name__)


def fetch_ollama_models(api_base: str) -> list[str]:
    """
    Fetch available models from Ollama API.

    Args:
        api_base: Ollama API base URL (e.g., "http://localhost:11434")

    Returns:
        List of model names available on the Ollama server
    """
    try:
        # Ollama API endpoint for listing models
        url = f"{api_base.rstrip('/')}/api/tags"
        response = requests.get(url, timeout=5)
        response.raise_for_status()

        data = response.json()
        models = data.get("models", [])

        # Extract model names
        model_names = [model.get("name", "") for model in models if model.get("name")]

        logger.info(f"Fetched {len(model_names)} models from Ollama at {api_base}")
        return model_names

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching models from Ollama: {e}")
        return []
    except Exception as e:
        logger.error(f"Unexpected error fetching models from Ollama: {e}")
        return []
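Example call, assuming an Ollama server is running on its default port; an unreachable server simply yields an empty list:

```python
from core.ollama_models import fetch_ollama_models

for name in fetch_ollama_models("http://localhost:11434"):
    print(name)
```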
core/openrouter_models.py
ADDED
@@ -0,0 +1,208 @@
"""Helper functions to fetch and filter free models from OpenRouter API."""

import json
import logging
import os
import time
from pathlib import Path
from typing import Any

import requests

logger = logging.getLogger(__name__)

OPENROUTER_API_URL = "https://openrouter.ai/api/v1/models"
CACHE_DIR = Path(".cache")
CACHE_FILE = CACHE_DIR / "openrouter_models.json"
CACHE_DURATION_SECONDS = 24 * 60 * 60  # 24 hours


def is_free_model(model: dict[str, Any]) -> bool:
    """
    Check if a model is free based on its ID or pricing.

    Args:
        model: Model dictionary from OpenRouter API

    Returns:
        True if the model is free, False otherwise
    """
    model_id = model.get("id", "")

    # Check if model has :free suffix
    if ":free" in model_id:
        return True

    # Check if pricing is zero or null
    pricing = model.get("pricing", {})
    prompt_price = pricing.get("prompt", "0")
    completion_price = pricing.get("completion", "0")

    # Convert to float if possible, otherwise check if it's "0" or null
    try:
        prompt_price_float = float(prompt_price) if prompt_price else 0.0
        completion_price_float = float(completion_price) if completion_price else 0.0
        return prompt_price_float == 0.0 and completion_price_float == 0.0
    except (ValueError, TypeError):
        # If conversion fails, check if both are "0" or null/empty
        return (prompt_price in ["0", None, ""] and
                completion_price in ["0", None, ""])


def _load_cache() -> tuple[list[dict[str, Any]] | None, float | None]:
    """
    Load cached models from file.

    Returns:
        Tuple of (cached_models, cache_timestamp) or (None, None) if cache doesn't exist or is invalid
    """
    if not CACHE_FILE.exists():
        return None, None

    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            cache_data = json.load(f)

        cached_models = cache_data.get("models", None)
        cache_timestamp = cache_data.get("timestamp", None)

        if cached_models is None or cache_timestamp is None:
            return None, None

        return cached_models, cache_timestamp
    except (json.JSONDecodeError, IOError) as e:
        logger.warning(f"Error loading cache: {e}")
        return None, None


def _save_cache(models: list[dict[str, Any]]) -> None:
    """
    Save models to cache file.

    Args:
        models: List of model dictionaries to cache
    """
    try:
        CACHE_DIR.mkdir(parents=True, exist_ok=True)

        cache_data = {
            "models": models,
            "timestamp": time.time(),
        }

        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(cache_data, f)

        logger.info(f"Cached {len(models)} free models to {CACHE_FILE}")
    except IOError as e:
        logger.warning(f"Error saving cache: {e}")


def fetch_free_models() -> list[dict[str, Any]]:
    """
    Fetch all free models from OpenRouter API.
    Uses file-based cache that refreshes once per day.

    Returns:
        List of free model dictionaries with metadata
    """
    # Check cache first
    cached_models, cache_timestamp = _load_cache()

    if cached_models is not None and cache_timestamp is not None:
        # Check if cache is still valid (less than 24 hours old)
        age_seconds = time.time() - cache_timestamp
        if age_seconds < CACHE_DURATION_SECONDS:
            logger.info(f"Using cached models (age: {age_seconds / 3600:.1f} hours)")
            return cached_models
        else:
            logger.info(f"Cache expired (age: {age_seconds / 3600:.1f} hours), fetching fresh data")

    # Cache is invalid or doesn't exist, fetch from API
    try:
        # OpenRouter API doesn't require authentication for listing models
        response = requests.get(OPENROUTER_API_URL, timeout=10)
        response.raise_for_status()

        data = response.json()
        models = data.get("data", [])

        # Filter to only free models
        free_models = [model for model in models if is_free_model(model)]

        logger.info(f"Fetched {len(free_models)} free models from OpenRouter")

        # Save to cache
        _save_cache(free_models)

        return free_models

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching models from OpenRouter: {e}")
        # If API call fails but we have cached data, return cached data even if expired
        if cached_models is not None:
            logger.warning("API call failed, using expired cache as fallback")
            return cached_models
        return []
    except Exception as e:
        logger.error(f"Unexpected error fetching models: {e}")
        # If API call fails but we have cached data, return cached data even if expired
        if cached_models is not None:
            logger.warning("Unexpected error, using expired cache as fallback")
            return cached_models
        return []


def get_model_config(model: dict[str, Any]) -> dict[str, Any]:
    """
    Extract model configuration from OpenRouter API response.

    Args:
        model: Model dictionary from OpenRouter API

    Returns:
        Model configuration dictionary with type, model, max_context, tokenizer
    """
    model_id = model.get("id", "")
    context_length = model.get("context_length")
    architecture = model.get("architecture", {})
    tokenizer_group = architecture.get("tokenizer", "")

    # Infer tokenizer from model ID
    tokenizer = None
    hugging_face_id = model.get("hugging_face_id")

    # Use Hugging Face ID if available
    if hugging_face_id:
        tokenizer = f"hf/{hugging_face_id}"
    else:
        # Try to construct tokenizer name from model ID
        # For example: "nvidia/nemotron-3-nano-30b-a3b:free" -> "hf/nvidia/nemotron-3-nano-30b-a3b"
        parts = model_id.split("/")
        if len(parts) > 1:
            org = parts[0]
            model_name = parts[-1].split(":")[0]  # Remove :free suffix
            tokenizer = f"hf/{org}/{model_name}"
        else:
            # Single part model ID
            model_name = model_id.split(":")[0]
            tokenizer = f"hf/{model_name}"

    # Fallback to a generic tokenizer if we can't infer
    if not tokenizer:
        tokenizer = "gpt2"  # Generic fallback

    # Default context length if not provided
    if context_length is None:
        context_length = 131072

    return {
        "type": "free_openrouter",
        "model": f"openrouter/{model_id}",  # litellm format
        "max_context": context_length,
        "tokenizer": tokenizer,
        "model_id": model_id,
        "name": model.get("name", model_id),
        "description": model.get("description", ""),
    }
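Sketch of the intended call pattern: the first call hits the OpenRouter API, later calls within 24 hours are served from `.cache/openrouter_models.json`:

```python
from core.openrouter_models import fetch_free_models, get_model_config

free_models = fetch_free_models()
if free_models:
    cfg = get_model_config(free_models[0])
    print(cfg["model"], cfg["max_context"], cfg["tokenizer"])
```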
core/prompt_demo.py
ADDED
@@ -0,0 +1,82 @@
"""Standalone prompt template loader."""

import logging
import string

logger = logging.getLogger(__name__)

# Embedded discrepancy_generation prompt template
DISCREPANCY_GENERATION_PROMPT = """You are an expert in analyzing scientific papers and their code implementations.
Your task is to carefully identify concrete discrepancies between what is described in a paper and what is actually implemented in the code.

## What counts as a discrepancy
- A concrete paper–code discrepancy means a mismatch between what is stated in the original paper (e.g., formulas, algorithms, logic, methods, processes, or other settings) and what is implemented in the original code repository.
- Each distinct mismatch should be reported as a separate item.

## What does not count as a discrepancy
- Missing information in the paper like hyperparameters (e.g., "the authors did not specify X").
- Hyperparameter mismatches (e.g., learning rate, batch size, dropout rate), since these are typically configurable in code repository.
- Missing implementation in the original code repository (e.g., "the authors did not provide the code for X").
- Bugs or errors in the code that are unrelated to what the paper describes.

## Output format

Provide your findings in the following YAML structure:

```yaml
discrepancies:
- <a summary of the discrepancy between the paper and the code in 3-8 sentences. Your description should contain three parts focusing on the discrepancy: 1) summarize what is described in the paper, 2) summarize what is implemented in the code, and 3) summarize the difference. Do not speculate about the impact.>
- <if there are multiple discrepancies, put each of them in a separate item.>
```

## Paper

${paper}

## Code

${code}
"""


class Prompt:
    """Prompt template handler."""

    def __init__(self, template: str = "discrepancy_generation"):
        """
        Initialize prompt template.

        Args:
            template: Template name (currently only "discrepancy_generation" is supported)
        """
        self.template = template

        if template == "discrepancy_generation":
            self.prompt_template = DISCREPANCY_GENERATION_PROMPT
        else:
            raise ValueError(f"Template '{template}' not found. Available: 'discrepancy_generation'")

        # Create Template object for variable substitution
        self.prompt = string.Template(self.prompt_template)

        # Extract variables from the template
        self.prompt_vars = list(self.prompt.get_identifiers())

    def __call__(self, **kwargs) -> str:
        """
        Substitute variables in the prompt template.

        Args:
            **kwargs: Variables to substitute (e.g., paper, code)

        Returns:
            Formatted prompt string
        """
        # Remove any '<|endoftext|>' from the kwargs
        for k, v in kwargs.items():
            if isinstance(v, str) and "<|endoftext|>" in v:
                kwargs[k] = v.replace("<|endoftext|>", "endoftext")

        return self.prompt.safe_substitute(**kwargs)
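Filling the template is a plain `string.Template` substitution; a short sketch with placeholder paper and code strings:

```python
from core.prompt_demo import Prompt

prompt = Prompt("discrepancy_generation")
print(prompt.prompt_vars)  # ['paper', 'code']
full_prompt = prompt(paper="<paper markdown here>", code="<code prompt here>")
```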
core/token_counter_demo.py
ADDED
@@ -0,0 +1,35 @@
"""Standalone token counter using litellm."""

import logging

from litellm import token_counter

logger = logging.getLogger(__name__)


class TokenCounter:
    """Token counter for various model types using litellm."""

    def __init__(self, model: str):
        """
        Initialize token counter.

        Args:
            model: Model identifier (e.g., "gpt-4", "claude-3-5-sonnet", etc.)
        """
        self.model = model
        logger.info(f"Using litellm token counter for {self.model}")

    def __call__(self, text: str) -> int:
        """Count tokens in text using litellm."""
        if len(text) == 0:
            return 0

        try:
            return token_counter(model=self.model, text=text)
        except Exception as e:
            logger.warning(f"Error counting tokens with litellm: {e}")
            # Fallback: rough estimate (1 token ≈ 4 characters)
            return len(text) // 4
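Counting is a single call; a sketch using `gpt-4o` as the tokenizer model:

```python
from core.token_counter_demo import TokenCounter

count_tokens = TokenCounter("gpt-4o")
n = count_tokens("How many tokens does this sentence use?")
print(n)  # falls back to len(text) // 4 when litellm has no tokenizer for the model
```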
parsing.py
ADDED
@@ -0,0 +1,86 @@
"""Discrepancy parsing logic for extracting discrepancies from LLM output."""

import logging
import re

logger = logging.getLogger(__name__)


def parse_discrepancies(text: str) -> list[str] | None:
    """
    Extract list items (discrepancies) from model output.

    Replicates the _extract_list_items logic from scicoqa/inference/discrepancy_eval.py

    Args:
        text: Raw text output from LLM

    Returns:
        List of discrepancy strings, or None if no discrepancies found
    """
    if not text:
        return None

    # Remove redacted reasoning if present
    if "</think>" in text:
        text = text.split("</think>")[1]

    # Detect YAML or dashed list format
    if "```yaml\ndiscrepancies:" in text:
        text = text.split("```yaml\ndiscrepancies:")[-1]
        yaml_or_dashed = True
    elif "```yaml" in text:
        text = text.split("```yaml")[-1]
        yaml_or_dashed = True
    elif "discrepancies:" in text:
        text = text.split("discrepancies:")[1]
        yaml_or_dashed = True
    elif re.search(r"# Discrepancies[\s\r\n]*-", text, re.IGNORECASE):
        text = re.split(
            r"# Discrepancies[\s\r\n]*-", text, maxsplit=1, flags=re.IGNORECASE
        )[1]
        text = "- " + text
        yaml_or_dashed = True
    else:
        yaml_or_dashed = False

    if yaml_or_dashed:
        # Clean up the text
        text = text.strip("\n").strip().strip("```yaml").strip("```").strip("\n")
        text = (
            text.strip("discrepancies:").strip("discrepancies").strip("\n").strip()
        )

        # Split by list item pattern
        pattern = r"\n\s{0,2}-\s+"
        parts = re.split(pattern, text)

        items = []
        for part in parts:
            cleaned = " ".join(part.split())
            if cleaned and not cleaned.startswith("discrepancies:"):
                # Multiple cleaning passes
                cleaned = cleaned.strip().strip("-").strip()
                cleaned = cleaned.strip().strip("-").strip()
                cleaned = cleaned.strip().strip("|").strip()
                cleaned = cleaned.strip().strip(">-").strip()
                cleaned = cleaned.strip().strip(">").strip()
                cleaned = cleaned.strip().strip('"').strip()
                cleaned = cleaned.strip().strip("'").strip()
                cleaned = cleaned.strip("summary: |\n")
                cleaned = cleaned.strip("summary: ")
                cleaned = cleaned.strip("|")
                cleaned = cleaned.strip("\n").strip()
                # Remove numbered prefixes
                cleaned = re.sub(r"^[0-9]+[\.\)]\s*", "", cleaned)
                if cleaned:  # Only add non-empty items
                    items.append(cleaned)
    else:
        items = None

    # Handle empty list case
    if items and len(items) == 1 and items[0].strip() == "[]":
        items = None

    return items if items else None
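A sketch of the parser on a typical model answer in the `discrepancies:` format described by the prompt template; the two list items are made-up examples:

```python
from parsing import parse_discrepancies

raw_output = """discrepancies:
- The paper describes cosine learning-rate decay, but the code uses a constant schedule.
- The paper normalizes embeddings before pooling, while the code pools first."""

print(parse_discrepancies(raw_output))
# ['The paper describes cosine learning-rate decay, but the code uses a constant schedule.',
#  'The paper normalizes embeddings before pooling, while the code pools first.']
```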
requirements.txt
ADDED
@@ -0,0 +1,9 @@
streamlit>=1.28.0
litellm>=1.17.0
requests>=2.31.0
gitpython>=3.1.40
pyyaml>=6.0
python-dotenv>=1.0.0
nbconvert>=7.10.0
nbformat>=5.9.0
tqdm>=4.66.0