File size: 3,252 Bytes
ae15120
3b29588
243528d
 
c01a6a6
 
243528d
 
3b29588
243528d
 
3b29588
243528d
 
 
 
 
a0cd7aa
243528d
 
 
 
 
 
 
3b29588
243528d
 
3b29588
 
243528d
3b29588
243528d
 
 
 
 
 
3b29588
243528d
 
3b29588
243528d
 
 
ae15120
3b29588
243528d
 
 
 
 
 
 
 
 
 
 
3b29588
582aac3
243528d
c01a6a6
 
582aac3
3b29588
582aac3
3b29588
c01a6a6
243528d
 
 
 
 
 
 
 
582aac3
 
3b29588
243528d
 
582aac3
243528d
582aac3
243528d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import pandas as pd
import csv
import io
from langchain_community.llms import HuggingFaceEndpoint
# FIX: Changed import path from langchain_community to langchain_experimental
from langchain_experimental.agents import create_pandas_dataframe_agent
from dotenv import load_dotenv
import os

# Load environment variables from .env file
# (makes HUGGINGFACEHUB_API_TOKEN and friends available via os.environ).
load_dotenv()

# --- Hugging Face Model Configuration ---
# Repository ID of the instruction-tuned model queried through the
# Hugging Face Inference endpoint in query_agent().
HF_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"

def detect_delimiter(file_content: bytes) -> str:
    """Detect the field delimiter used in raw CSV file content.

    Args:
        file_content: Raw bytes of the uploaded CSV file.

    Returns:
        The detected single-character delimiter, or ',' as a safe
        fallback when the content is empty or the sniffer cannot reach
        a confident guess.
    """
    # errors='replace' keeps malformed byte sequences from raising
    # UnicodeDecodeError; the old blanket `except Exception` silently
    # masked that (and any other bug) as a comma.
    sample_lines = file_content.decode('utf-8', errors='replace').splitlines()[:5]
    if not sample_lines:
        return ','  # Default to comma if empty

    try:
        # Use csv.Sniffer to guess the dialect (and thus the delimiter)
        # from the first few lines only — enough for a reliable guess.
        dialect = csv.Sniffer().sniff('\n'.join(sample_lines))
        return dialect.delimiter
    except csv.Error:
        # Sniffer raises csv.Error when it cannot determine a dialect;
        # fall back to a comma.
        return ','

def query_agent(uploaded_file_content: bytes, query: str, hf_api_token: str) -> str:
    """
    Initializes a LangChain Pandas Agent and processes a natural language query using a Hugging Face LLM.

    Args:
        uploaded_file_content: The byte content of the uploaded CSV file.
        query: The natural language question from the user.
        hf_api_token: The API token for the Hugging Face Hub.

    Returns:
        The response generated by the agent, or an error string if the
        token is missing or any step of the pipeline fails.
    """
    if not hf_api_token:
        # Fail fast before doing any decoding / network work.
        return "Error: HUGGINGFACEHUB_API_TOKEN is not configured."

    try:
        # 1. Robustly read CSV content using the detected delimiter.
        # Decode the bytes exactly once; the previous version decoded
        # them twice (once inside detect_delimiter, once here).
        delimiter = detect_delimiter(uploaded_file_content)
        text = uploaded_file_content.decode('utf-8')
        df = pd.read_csv(io.StringIO(text), sep=delimiter)

        # 2. Initialize the LLM using HuggingFaceEndpoint
        llm = HuggingFaceEndpoint(
            repo_id=HF_REPO_ID,
            huggingfacehub_api_token=hf_api_token,
            temperature=0.0,  # Keep reasoning deterministic
            max_new_tokens=512
        )

        # 3. Create the Pandas DataFrame Agent
        # CRITICAL SECURITY NOTE: allow_dangerous_code=True is required —
        # LangChain demands an explicit opt-in because the agent executes
        # LLM-generated Python. Only run this on trusted CSV/query input.
        agent = create_pandas_dataframe_agent(
            llm,
            df,
            verbose=True,
            allow_dangerous_code=True,  # Added to prevent runtime ValueError
            # System prompt to guide the agent's behavior
            agent_kwargs={
                "system_message": (
                    "You are an expert data analysis assistant. You are interacting with a pandas DataFrame "
                    "named 'df'. Use Python code only to answer questions about the data. "
                    "Do not make up facts. Always show the code you executed before giving the final answer."
                )
            }
        )

        # 4. Run the query
        response = agent.run(query)

        return response

    except Exception as e:
        # Boundary handler: this function feeds a UI, so surface any
        # failure (bad CSV, network, model error) as a readable string
        # rather than letting the exception propagate.
        return f"An error occurred during analysis: {e}"