LeannJoy commited on
Commit
243528d
·
verified ·
1 Parent(s): 6e2134f

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +59 -46
utils.py CHANGED
@@ -1,70 +1,83 @@
1
  import pandas as pd
2
- import io
3
  import csv
4
- from langchain.agents.agent_types import AgentType
5
- from langchain_community.llms import HuggingFaceHub
6
- from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
 
 
 
7
 
8
- # Use a highly capable open-source instruction model for better reasoning
9
- # Mistral-7B-Instruct-v0.2 is a strong choice for this agent.
10
- MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"
11
 
12
- def detect_csv_delimiter(uploaded_file_bytes):
13
- """
14
- Detects the delimiter of the CSV file by reading a sample.
15
- Handles various delimiters (comma, semicolon, tab, pipe).
16
- """
17
  try:
18
- # Decode the first chunk of the file to a string
19
- sample = uploaded_file_bytes.getvalue().decode('utf-8').splitlines()[0]
20
- # Use csv.Sniffer to guess the dialect/delimiter
21
- dialect = csv.Sniffer().sniff(sample)
 
 
 
22
  return dialect.delimiter
23
- except Exception as e:
24
- print(f"Error detecting delimiter, defaulting to comma: {e}")
25
  return ','
26
 
27
- def query_agent(uploaded_file_bytes, user_query):
28
  """
29
- Initializes and runs the LangChain Pandas DataFrame Agent.
 
 
 
 
 
30
 
31
- It first detects the delimiter, loads the DataFrame, sets up the LLM,
32
- and executes the query.
33
  """
34
- # 1. Delimiter Detection & DataFrame Loading
35
- delimiter = detect_csv_delimiter(uploaded_file_bytes)
36
-
37
- # Reset stream position to the beginning before reading with pandas
38
- uploaded_file_bytes.seek(0)
39
-
40
- try:
41
- # Read the file using the detected delimiter
42
- df = pd.read_csv(uploaded_file_bytes, sep=delimiter, encoding='utf-8')
43
- except Exception as e:
44
- return f"Error loading data with detected delimiter '{delimiter}': {e}"
45
 
46
- # 2. Initialize HuggingFace LLM
47
  try:
48
- # Note: HUGGINGFACEHUB_API_TOKEN must be set in the environment
49
- llm = HuggingFaceHub(
50
- repo_id=MODEL_ID,
51
- model_kwargs={"temperature": 0.1, "max_length": 512}
 
 
 
 
 
 
 
52
  )
53
- except Exception as e:
54
- return f"Error initializing LLM (HuggingFaceHub). Ensure API key is set and valid: {e}"
55
 
56
- # 3. Create LangChain Pandas Agent
57
- try:
58
  agent = create_pandas_dataframe_agent(
59
  llm,
60
  df,
61
  verbose=True,
62
- agent_type=AgentType.OPENAI_FUNCTIONS, # Using OPENAI_FUNCTIONS for structured response, compatible with HuggingFaceHub and best for structured reasoning
63
- handle_parsing_errors=True
 
 
 
 
 
 
64
  )
65
 
66
  # 4. Run the query
67
- response = agent.run(user_query)
 
68
  return response
 
69
  except Exception as e:
70
- return f"An error occurred during agent execution. The model might not have enough context or the query caused a parsing issue. Error: {e}"
 
 
1
  import pandas as pd
 
2
  import csv
3
+ import io
4
+ # Changed LLM import to HuggingFaceEndpoint
5
+ from langchain_community.llms import HuggingFaceEndpoint
6
+ from langchain_community.agent_toolkits import create_pandas_dataframe_agent
7
+ from dotenv import load_dotenv
8
+ import os
9
 
10
+ # Load environment variables from .env file
11
+ load_dotenv()
 
12
 
13
+ # --- Hugging Face Model Configuration ---
14
+ HF_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"
15
+
16
def detect_delimiter(file_content: bytes) -> str:
    """Detect the field delimiter used in raw CSV file content.

    Samples up to the first five lines and asks ``csv.Sniffer`` to guess
    the dialect, restricting the candidates to the delimiters this app
    supports (comma, semicolon, tab, pipe) so the sniffer cannot latch
    onto an arbitrary character on a short sample.

    Args:
        file_content: Raw bytes of the uploaded CSV file (assumed UTF-8).

    Returns:
        The detected single-character delimiter, or ',' as a safe default
        when the content is empty, not valid UTF-8, or unsniffable.
    """
    try:
        # Decode only to sample the first few lines; a UnicodeDecodeError
        # here falls through to the comma default below.
        sample = file_content.decode('utf-8').splitlines()[:5]
        if not sample:
            return ','  # Default to comma if empty

        # Use csv.Sniffer to guess the dialect (and thus the delimiter).
        # The delimiters hint keeps the guess within the supported set —
        # without it, Sniffer may pick surprising characters (e.g. a digit).
        dialect = csv.Sniffer().sniff('\n'.join(sample), delimiters=',;\t|')
        return dialect.delimiter
    except Exception:
        # Fallback to a comma if sniffing fails for any reason
        return ','
30
 
31
def query_agent(uploaded_file_content: bytes, query: str, hf_api_token: str) -> str:
    """Answer a natural-language question about a CSV via a Hugging Face LLM.

    Loads the uploaded bytes into a pandas DataFrame (using the sniffed
    delimiter), wires the frame into a LangChain pandas agent backed by
    ``HuggingFaceEndpoint``, and runs the user's query through the agent.

    Args:
        uploaded_file_content: The byte content of the uploaded CSV file.
        query: The natural language question from the user.
        hf_api_token: The API token for the Hugging Face Hub.

    Returns:
        The agent's answer, or a human-readable error string on failure.
    """
    # Refuse to start without credentials rather than failing mid-flight.
    if not hf_api_token:
        return "Error: HUGGINGFACEHUB_API_TOKEN is not configured."

    try:
        # Build the DataFrame from the raw upload, honouring whatever
        # delimiter the sniffer detected (comma, semicolon, tab, pipe).
        sep = detect_delimiter(uploaded_file_content)
        frame = pd.read_csv(
            io.StringIO(uploaded_file_content.decode('utf-8')),
            sep=sep,
        )

        # Deterministic, bounded generation from the hosted endpoint.
        # NOTE(review): some HF inference backends reject temperature == 0.0
        # ("must be strictly positive") — confirm against the deployed stack.
        model = HuggingFaceEndpoint(
            repo_id=HF_REPO_ID,
            huggingfacehub_api_token=hf_api_token,
            temperature=0.0,
            max_new_tokens=512,
        )

        # System prompt keeps the agent grounded in the DataFrame and forces
        # it to show its work before answering.
        guidance = (
            "You are an expert data analysis assistant. You are interacting with a pandas DataFrame "
            "named 'df'. Use Python code only to answer questions about the data. "
            "Do not make up facts. Always show the code you executed before giving the final answer."
        )
        analyst = create_pandas_dataframe_agent(
            model,
            frame,
            verbose=True,
            agent_kwargs={"system_message": guidance},
        )

        # Run the query and hand the agent's answer straight back.
        return analyst.run(query)

    except Exception as e:
        # Surface any failure (decode, parse, endpoint, agent) as text so
        # the UI can display it instead of crashing.
        return f"An error occurred during analysis: {e}"