LeannJoy commited on
Commit
243528d
·
verified ·
1 Parent(s): 6e2134f

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +59 -46
utils.py CHANGED
@@ -1,70 +1,83 @@
1
  import pandas as pd
2
- import io
3
  import csv
4
- from langchain.agents.agent_types import AgentType
5
- from langchain_community.llms import HuggingFaceHub
6
- from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
 
 
 
7
 
8
- # Use a highly capable open-source instruction model for better reasoning
9
- # Mistral-7B-Instruct-v0.2 is a strong choice for this agent.
10
- MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"
11
 
12
- def detect_csv_delimiter(uploaded_file_bytes):
13
- """
14
- Detects the delimiter of the CSV file by reading a sample.
15
- Handles various delimiters (comma, semicolon, tab, pipe).
16
- """
17
  try:
18
- # Decode the first chunk of the file to a string
19
- sample = uploaded_file_bytes.getvalue().decode('utf-8').splitlines()[0]
20
- # Use csv.Sniffer to guess the dialect/delimiter
21
- dialect = csv.Sniffer().sniff(sample)
 
 
 
22
  return dialect.delimiter
23
- except Exception as e:
24
- print(f"Error detecting delimiter, defaulting to comma: {e}")
25
  return ','
26
 
27
- def query_agent(uploaded_file_bytes, user_query):
28
  """
29
- Initializes and runs the LangChain Pandas DataFrame Agent.
 
 
 
 
 
30
 
31
- It first detects the delimiter, loads the DataFrame, sets up the LLM,
32
- and executes the query.
33
  """
34
- # 1. Delimiter Detection & DataFrame Loading
35
- delimiter = detect_csv_delimiter(uploaded_file_bytes)
36
-
37
- # Reset stream position to the beginning before reading with pandas
38
- uploaded_file_bytes.seek(0)
39
-
40
- try:
41
- # Read the file using the detected delimiter
42
- df = pd.read_csv(uploaded_file_bytes, sep=delimiter, encoding='utf-8')
43
- except Exception as e:
44
- return f"Error loading data with detected delimiter '{delimiter}': {e}"
45
 
46
- # 2. Initialize HuggingFace LLM
47
  try:
48
- # Note: HUGGINGFACEHUB_API_TOKEN must be set in the environment
49
- llm = HuggingFaceHub(
50
- repo_id=MODEL_ID,
51
- model_kwargs={"temperature": 0.1, "max_length": 512}
 
 
 
 
 
 
 
52
  )
53
- except Exception as e:
54
- return f"Error initializing LLM (HuggingFaceHub). Ensure API key is set and valid: {e}"
55
 
56
- # 3. Create LangChain Pandas Agent
57
- try:
58
  agent = create_pandas_dataframe_agent(
59
  llm,
60
  df,
61
  verbose=True,
62
- agent_type=AgentType.OPENAI_FUNCTIONS, # Using OPENAI_FUNCTIONS for structured response, compatible with HuggingFaceHub and best for structured reasoning
63
- handle_parsing_errors=True
 
 
 
 
 
 
64
  )
65
 
66
  # 4. Run the query
67
- response = agent.run(user_query)
 
68
  return response
 
69
  except Exception as e:
70
- return f"An error occurred during agent execution. The model might not have enough context or the query caused a parsing issue. Error: {e}"
 
 
1
  import pandas as pd
 
2
  import csv
3
+ import io
4
+ # Changed LLM import to HuggingFaceEndpoint
5
+ from langchain_community.llms import HuggingFaceEndpoint
6
+ from langchain_community.agent_toolkits import create_pandas_dataframe_agent
7
+ from dotenv import load_dotenv
8
+ import os
9
 
10
+ # Load environment variables from .env file
11
+ load_dotenv()
 
12
 
13
+ # --- Hugging Face Model Configuration ---
14
+ HF_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"
15
+
16
def detect_delimiter(file_content: bytes) -> str:
    """Detect the field delimiter used in raw CSV file content.

    Samples up to the first five lines and asks ``csv.Sniffer`` to guess
    the dialect, restricting the candidates to the delimiters this app
    supports (comma, semicolon, tab, pipe) so the sniffer cannot latch
    onto an arbitrary character on a short sample.

    Args:
        file_content: Raw bytes of the uploaded CSV file (assumed UTF-8).

    Returns:
        The detected single-character delimiter, or ',' as a safe default
        when the content is empty, not valid UTF-8, or unsniffable.
    """
    try:
        # Decode only to sample the first few lines; a UnicodeDecodeError
        # here falls through to the comma default below.
        sample = file_content.decode('utf-8').splitlines()[:5]
        if not sample:
            return ','  # Default to comma if empty

        # Use csv.Sniffer to guess the dialect (and thus the delimiter).
        # The delimiters hint keeps the guess within the supported set —
        # without it, Sniffer may pick surprising characters (e.g. a digit).
        dialect = csv.Sniffer().sniff('\n'.join(sample), delimiters=',;\t|')
        return dialect.delimiter
    except Exception:
        # Fallback to a comma if sniffing fails for any reason
        return ','
30
 
31
def query_agent(uploaded_file_content: bytes, query: str, hf_api_token: str) -> str:
    """Answer a natural-language question about a CSV via a Hugging Face LLM.

    Loads the uploaded bytes into a pandas DataFrame (using the sniffed
    delimiter), wires the frame into a LangChain pandas agent backed by
    ``HuggingFaceEndpoint``, and runs the user's query through the agent.

    Args:
        uploaded_file_content: The byte content of the uploaded CSV file.
        query: The natural language question from the user.
        hf_api_token: The API token for the Hugging Face Hub.

    Returns:
        The agent's answer, or a human-readable error string on failure.
    """
    # Refuse to start without credentials rather than failing mid-flight.
    if not hf_api_token:
        return "Error: HUGGINGFACEHUB_API_TOKEN is not configured."

    try:
        # Build the DataFrame from the raw upload, honouring whatever
        # delimiter the sniffer detected (comma, semicolon, tab, pipe).
        sep = detect_delimiter(uploaded_file_content)
        frame = pd.read_csv(
            io.StringIO(uploaded_file_content.decode('utf-8')),
            sep=sep,
        )

        # Deterministic, bounded generation from the hosted endpoint.
        # NOTE(review): some HF inference backends reject temperature == 0.0
        # ("must be strictly positive") — confirm against the deployed stack.
        model = HuggingFaceEndpoint(
            repo_id=HF_REPO_ID,
            huggingfacehub_api_token=hf_api_token,
            temperature=0.0,
            max_new_tokens=512,
        )

        # System prompt keeps the agent grounded in the DataFrame and forces
        # it to show its work before answering.
        guidance = (
            "You are an expert data analysis assistant. You are interacting with a pandas DataFrame "
            "named 'df'. Use Python code only to answer questions about the data. "
            "Do not make up facts. Always show the code you executed before giving the final answer."
        )
        analyst = create_pandas_dataframe_agent(
            model,
            frame,
            verbose=True,
            agent_kwargs={"system_message": guidance},
        )

        # Run the query and hand the agent's answer straight back.
        return analyst.run(query)

    except Exception as e:
        # Surface any failure (decode, parse, endpoint, agent) as text so
        # the UI can display it instead of crashing.
        return f"An error occurred during analysis: {e}"