import pandas as pd
import csv
import io
from langchain_community.llms import HuggingFaceEndpoint
# FIX: Changed import path from langchain_community to langchain_experimental
from langchain_experimental.agents import create_pandas_dataframe_agent
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# --- Hugging Face Model Configuration ---
HF_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"


def detect_delimiter(file_content: bytes) -> str:
    """Detects the delimiter of a CSV file's content."""
    try:
        # Decode the first few lines to sample the content
        sample = file_content.decode('utf-8').splitlines()[:5]
        if not sample:
            return ','  # Default to comma if empty
        # Use csv.Sniffer to guess the dialect (and thus the delimiter)
        dialect = csv.Sniffer().sniff('\n'.join(sample))
        return dialect.delimiter
    except Exception:
        # Fall back to a comma if sniffing fails
        return ','
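
# Illustrative example (hypothetical input): for a semicolon-separated sample such as
#   detect_delimiter(b"name;age\nalice;30\nbob;25\n")
# csv.Sniffer would typically report ';' as the delimiter.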


def query_agent(uploaded_file_content: bytes, query: str, hf_api_token: str) -> str:
    """
    Initializes a LangChain Pandas Agent and processes a natural language query using a Hugging Face LLM.

    Args:
        uploaded_file_content: The byte content of the uploaded CSV file.
        query: The natural language question from the user.
        hf_api_token: The API token for the Hugging Face Hub.

    Returns:
        The response generated by the agent.
    """
    if not hf_api_token:
        # Updated error message for Hugging Face
        return "Error: HUGGINGFACEHUB_API_TOKEN is not configured."

    try:
        # 1. Robustly read the CSV content using the detected delimiter
        delimiter = detect_delimiter(uploaded_file_content)
        data_io = io.StringIO(uploaded_file_content.decode('utf-8'))
        df = pd.read_csv(data_io, sep=delimiter)
        # 2. Initialize the LLM using HuggingFaceEndpoint
        llm = HuggingFaceEndpoint(
            repo_id=HF_REPO_ID,
            huggingfacehub_api_token=hf_api_token,
            temperature=0.01,  # Near-zero keeps reasoning deterministic; the HF endpoint rejects a temperature of exactly 0
            max_new_tokens=512
        )
        # 3. Create the Pandas DataFrame Agent
        # CRITICAL SECURITY NOTE: allow_dangerous_code=True must be set, because
        # LangChain now requires an explicit opt-in for code-execution agents.
        agent = create_pandas_dataframe_agent(
            llm,
            df,
            verbose=True,
            allow_dangerous_code=True,  # Added to prevent a runtime ValueError
            # Pass the system guidance through `prefix`; recent langchain_experimental
            # releases reject unrecognized kwargs such as agent_kwargs
            prefix=(
                "You are an expert data analysis assistant. You are interacting with a pandas DataFrame "
                "named 'df'. Use Python code only to answer questions about the data. "
                "Do not make up facts. Always show the code you executed before giving the final answer."
            )
        )
        # 4. Run the query
        response = agent.run(query)
        return response
    except Exception as e:
        # Catch and report any exceptions during processing
        return f"An error occurred during analysis: {e}"