yonkoyonks commited on
Commit
c23a150
·
verified ·
1 Parent(s): a3796ba

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +61 -61
utils.py CHANGED
@@ -1,62 +1,62 @@
1
- from langchain_community.llms import LlamaCpp
2
- from dotenv import load_dotenv
3
- import os
4
- import pandas as pd
5
-
6
- # Load environment variables
7
- load_dotenv()
8
- MODEL_PATH = os.getenv("MODEL_PATH", "./models/gemma-2b-it.Q2_K.gguf")
9
-
10
- def summarize_dataframe(df: pd.DataFrame, max_rows: int = 30) -> str:
11
- """Summarize a dataframe to avoid overloading the context window."""
12
- summary = f"Columns: {', '.join(df.columns)}\n\n"
13
- if len(df) > max_rows:
14
- sample = df.sample(max_rows, random_state=42)
15
- summary += "Showing a random sample of rows:\n"
16
- else:
17
- sample = df
18
- summary += "Showing all rows:\n"
19
- summary += sample.to_string(index=False)
20
- return summary
21
-
22
- def query_agent(df: pd.DataFrame, query: str) -> str:
23
- """Query a CSV/DataFrame using your local Gemma model with context-aware limits."""
24
- # Attempt to handle simple analytical questions directly with pandas
25
- query_lower = query.lower()
26
- try:
27
- if "most common" in query_lower or "most frequent" in query_lower:
28
- for col in df.columns:
29
- if col.lower() in query_lower:
30
- value = df[col].mode()[0]
31
- return f"The most common value in column '{col}' is '{value}'."
32
- except Exception as e:
33
- print("Direct analysis failed:", e)
34
-
35
- # Otherwise summarize dataset for LLM
36
- data_text = summarize_dataframe(df)
37
-
38
- prompt = f"""
39
- You are a data analysis assistant with expertise in statistics and data interpretation.
40
-
41
- Analyze the dataset sample below and answer the user's question in a **clear, detailed, and well-explained way**.
42
- Include both the **direct answer** and a short **explanation or reasoning** behind it.
43
-
44
- Dataset Summary:
45
- {data_text}
46
-
47
- Question:
48
- {query}
49
-
50
- Answer (with explanation):
51
- """
52
-
53
- llm = LlamaCpp(
54
- model_path=MODEL_PATH,
55
- temperature=0.7,
56
- max_new_tokens=1024,
57
- n_ctx=16384,
58
- verbose=True,
59
- )
60
-
61
- answer = llm(prompt)
62
  return answer
 
1
+ from langchain_community.llms import LlamaCpp
2
+ from dotenv import load_dotenv
3
+ import os
4
+ import pandas as pd
5
+
6
+ # Load environment variables
7
+ load_dotenv()
8
+ MODEL_PATH = os.getenv("MODEL_PATH", "TheBloke/gemma-2b-it-GGUF/gemma-2b-it.Q4_K_M.gguf")
9
+
10
def summarize_dataframe(df: pd.DataFrame, max_rows: int = 30) -> str:
    """Render *df* as compact text suitable for inclusion in an LLM prompt.

    Lists the column names, then shows either every row (when the frame has
    at most ``max_rows`` rows) or a fixed-seed random sample of ``max_rows``
    rows, so repeated calls on the same frame produce identical output.
    """
    pieces = [f"Columns: {', '.join(df.columns)}\n\n"]
    if len(df) > max_rows:
        # Seeded sample keeps the summary deterministic across calls.
        shown = df.sample(max_rows, random_state=42)
        pieces.append("Showing a random sample of rows:\n")
    else:
        shown = df
        pieces.append("Showing all rows:\n")
    pieces.append(shown.to_string(index=False))
    return "".join(pieces)
21
+
22
def query_agent(df: pd.DataFrame, query: str) -> str:
    """Answer a natural-language *query* about *df*.

    Tries a cheap pandas fast path for simple frequency questions
    ("most common" / "most frequent" + a column name appearing in the
    query); otherwise summarizes the frame and asks the local Gemma
    model loaded via llama.cpp.

    Args:
        df: The dataframe to analyze.
        query: The user's question.

    Returns:
        A human-readable answer string.
    """
    # Fast path: handle simple frequency questions directly with pandas.
    query_lower = query.lower()
    try:
        if "most common" in query_lower or "most frequent" in query_lower:
            for col in df.columns:
                # Substring match: the column name must appear in the question.
                if col.lower() in query_lower:
                    value = df[col].mode()[0]
                    return f"The most common value in column '{col}' is '{value}'."
    except Exception as e:
        # Best-effort shortcut only: on any failure fall through to the LLM.
        print("Direct analysis failed:", e)

    # Otherwise summarize the dataset so the prompt fits the context window.
    data_text = summarize_dataframe(df)

    prompt = f"""
You are a data analysis assistant with expertise in statistics and data interpretation.

Analyze the dataset sample below and answer the user's question in a **clear, detailed, and well-explained way**.
Include both the **direct answer** and a short **explanation or reasoning** behind it.

Dataset Summary:
{data_text}

Question:
{query}

Answer (with explanation):
"""

    llm = LlamaCpp(
        model_path=MODEL_PATH,
        temperature=0.7,
        # Fix: LangChain's LlamaCpp takes `max_tokens`, not `max_new_tokens`;
        # the original kwarg was never applied as a generation cap.
        max_tokens=1024,
        n_ctx=16384,
        verbose=True,
    )

    # Fix: `llm(prompt)` (__call__) is deprecated in LangChain; use invoke().
    answer = llm.invoke(prompt)
    return answer