File size: 3,252 Bytes
ae15120
3b29588
243528d
 
c01a6a6
 
243528d
 
3b29588
243528d
 
3b29588
243528d
 
 
 
 
a0cd7aa
243528d
 
 
 
 
 
 
3b29588
243528d
 
3b29588
 
243528d
3b29588
243528d
 
 
 
 
 
3b29588
243528d
 
3b29588
243528d
 
 
ae15120
3b29588
243528d
 
 
 
 
 
 
 
 
 
 
3b29588
582aac3
243528d
c01a6a6
 
582aac3
3b29588
582aac3
3b29588
c01a6a6
243528d
 
 
 
 
 
 
 
582aac3
 
3b29588
243528d
 
582aac3
243528d
582aac3
243528d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import pandas as pd
import csv
import io
from langchain_community.llms import HuggingFaceEndpoint
# FIX: Changed import path from langchain_community to langchain_experimental
from langchain_experimental.agents import create_pandas_dataframe_agent
from dotenv import load_dotenv
import os

# Load environment variables from .env file
# (makes HUGGINGFACEHUB_API_TOKEN and friends available via os.environ).
load_dotenv()

# --- Hugging Face Model Configuration ---
# Repository ID of the instruction-tuned model queried through the
# Hugging Face Inference endpoint in query_agent().
HF_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"

def detect_delimiter(file_content: bytes) -> str:
    """Detect the field delimiter used in raw CSV file content.

    Args:
        file_content: Raw bytes of the uploaded CSV file.

    Returns:
        The detected single-character delimiter, or ',' as a safe
        fallback when the content is empty or the sniffer cannot reach
        a confident guess.
    """
    # errors='replace' keeps malformed byte sequences from raising
    # UnicodeDecodeError; the old blanket `except Exception` silently
    # masked that (and any other bug) as a comma.
    sample_lines = file_content.decode('utf-8', errors='replace').splitlines()[:5]
    if not sample_lines:
        return ','  # Default to comma if empty

    try:
        # Use csv.Sniffer to guess the dialect (and thus the delimiter)
        # from the first few lines only — enough for a reliable guess.
        dialect = csv.Sniffer().sniff('\n'.join(sample_lines))
        return dialect.delimiter
    except csv.Error:
        # Sniffer raises csv.Error when it cannot determine a dialect;
        # fall back to a comma.
        return ','

def query_agent(uploaded_file_content: bytes, query: str, hf_api_token: str) -> str:
    """
    Initializes a LangChain Pandas Agent and processes a natural language query using a Hugging Face LLM.

    Args:
        uploaded_file_content: The byte content of the uploaded CSV file.
        query: The natural language question from the user.
        hf_api_token: The API token for the Hugging Face Hub.

    Returns:
        The response generated by the agent, or an error string if the
        token is missing or any step of the pipeline fails.
    """
    if not hf_api_token:
        # Fail fast before doing any decoding / network work.
        return "Error: HUGGINGFACEHUB_API_TOKEN is not configured."

    try:
        # 1. Robustly read CSV content using the detected delimiter.
        # Decode the bytes exactly once; the previous version decoded
        # them twice (once inside detect_delimiter, once here).
        delimiter = detect_delimiter(uploaded_file_content)
        text = uploaded_file_content.decode('utf-8')
        df = pd.read_csv(io.StringIO(text), sep=delimiter)

        # 2. Initialize the LLM using HuggingFaceEndpoint
        llm = HuggingFaceEndpoint(
            repo_id=HF_REPO_ID,
            huggingfacehub_api_token=hf_api_token,
            temperature=0.0,  # Keep reasoning deterministic
            max_new_tokens=512
        )

        # 3. Create the Pandas DataFrame Agent
        # CRITICAL SECURITY NOTE: allow_dangerous_code=True is required —
        # LangChain demands an explicit opt-in because the agent executes
        # LLM-generated Python. Only run this on trusted CSV/query input.
        agent = create_pandas_dataframe_agent(
            llm,
            df,
            verbose=True,
            allow_dangerous_code=True,  # Added to prevent runtime ValueError
            # System prompt to guide the agent's behavior
            agent_kwargs={
                "system_message": (
                    "You are an expert data analysis assistant. You are interacting with a pandas DataFrame "
                    "named 'df'. Use Python code only to answer questions about the data. "
                    "Do not make up facts. Always show the code you executed before giving the final answer."
                )
            }
        )

        # 4. Run the query
        response = agent.run(query)

        return response

    except Exception as e:
        # Boundary handler: this function feeds a UI, so surface any
        # failure (bad CSV, network, model error) as a readable string
        # rather than letting the exception propagate.
        return f"An error occurred during analysis: {e}"