Sw1ft0 committed on
Commit
7f64dba
·
1 Parent(s): 288f777

Add special header case handling to data loading section. Append schema with example rows to be passed to Gemini for context. Improve the prompt and format_result function.

Browse files
Files changed (1) hide show
  1. app.py +48 -13
app.py CHANGED
@@ -9,18 +9,42 @@ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
9
  model = genai.GenerativeModel("gemini-2.5-pro")
10
 
11
  # 2. Load Excel data
12
- df1 = pd.read_excel(os.path.join("data_source", "OC Onboarding Information.xlsx"), sheet_name=None)
13
  df2 = pd.read_excel(os.path.join("data_source", "The Alex Ideas Report.xlsx"), sheet_name=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # Build schema info for prompts
16
  def get_schema_info():
17
- schema1_lines = ["Report 1 - OC Onboarding Information", "Sheet Name: [Sheet Columns]"]
18
  for sheet, df in df1.items():
19
- schema1_lines.append(f"{sheet}: {list(df.columns)}")
20
- schema2_lines = ["Report 2 - The Alex Ideas Report", "Sheet Name: [Sheet Columns]"]
 
 
21
  for sheet, df in df2.items():
22
- schema2_lines.append(f"{sheet}: {list(df.columns)}")
23
- return "\n".join(schema1_lines + schema2_lines)
 
 
24
 
25
  schema_info = get_schema_info()
26
 
@@ -28,14 +52,20 @@ schema_info = get_schema_info()
28
  def format_result(result):
29
  # Convert numpy scalars
30
  if isinstance(result, np.generic):
31
- return result.item()
 
 
32
  # Convert dicts into readable strings
33
  if isinstance(result, dict):
34
  return "\n".join([f"{k}: {format_result(v)}" for k, v in result.items()])
35
  # Convert lists into comma-separated string
36
  if isinstance(result, list):
37
  return ", ".join(map(str, result))
38
- return result
 
 
 
 
39
 
40
  # 3. Core function
41
  def answer_question(history, message):
@@ -46,11 +76,13 @@ def answer_question(history, message):
46
  prompt = f"""
47
  You are a data analysis assistant.
48
  You can ONLY answer questions using the two Excel reports provided (df1 and df2).
49
- The reports are loaded as Python dictionaries of DataFrames:
50
- - df1['SheetName']['ColumnName'] for Report 1
51
- - df2['SheetName']['ColumnName'] for Report 2
52
  Do not hallucinate or use external knowledge.
53
- If the question is irrelevant, respond with:
 
 
 
 
 
54
  "I can only answer questions about the provided Excel reports."
55
 
56
  The reports have the following schema:
@@ -64,7 +96,10 @@ Rules:
64
  - Do NOT write import statements (pandas is already imported as pd).
65
  - Always put the answer in a variable named `result`.
66
  - Return ONLY Python code, nothing else.
67
- - If multiple values are tied for the maximum, include all of them.
 
 
 
68
  """
69
 
70
  try:
 
9
  model = genai.GenerativeModel("gemini-2.5-pro")
10
 
11
  # 2. Load Excel data
 
12
  df2 = pd.read_excel(os.path.join("data_source", "The Alex Ideas Report.xlsx"), sheet_name=None)
13
+ # Load all sheets, handle special header case
14
+ df1_all = pd.read_excel(
15
+ os.path.join("data_source", "OC Onboarding Information.xlsx"),
16
+ sheet_name=None,
17
+ header=None # load raw to inspect
18
+ )
19
+
20
+ df1 = {}
21
+ for sheet, raw_df in df1_all.items():
22
+ if sheet == "PY Event Diary":
23
+ # use row 2 as header
24
+ df1[sheet] = pd.read_excel(
25
+ os.path.join("data_source", "OC Onboarding Information.xlsx"),
26
+ sheet_name=sheet,
27
+ header=1
28
+ )
29
+ else:
30
+ df1[sheet] = pd.read_excel(
31
+ os.path.join("data_source", "OC Onboarding Information.xlsx"),
32
+ sheet_name=sheet
33
+ )
34
 
35
  # Build schema info for prompts
36
  def get_schema_info():
37
+ lines = ["Report 1 - OC Onboarding Information:"]
38
  for sheet, df in df1.items():
39
+ lines.append(f"Sheet: {sheet}, Columns: {list(df.columns)}")
40
+ sample = df.head(1).to_dict(orient="records")[0]
41
+ lines.append(f"Example row: {sample}")
42
+ lines.append("\nReport 2 - The Alex Ideas Report:")
43
  for sheet, df in df2.items():
44
+ lines.append(f"Sheet: {sheet}, Columns: {list(df.columns)}")
45
+ sample = df.head(1).to_dict(orient="records")[0]
46
+ lines.append(f"Example row: {sample}")
47
+ return "\n".join(lines)
48
 
49
  schema_info = get_schema_info()
50
 
 
52
  def format_result(result):
53
  # Convert numpy scalars
54
  if isinstance(result, np.generic):
55
+ return round(result.item(), 2)
56
+ if isinstance(result, (int, float)):
57
+ return round(result, 2)
58
  # Convert dicts into readable strings
59
  if isinstance(result, dict):
60
  return "\n".join([f"{k}: {format_result(v)}" for k, v in result.items()])
61
  # Convert lists into comma-separated string
62
  if isinstance(result, list):
63
  return ", ".join(map(str, result))
64
+ if isinstance(result, pd.Series):
65
+ return result.to_string()
66
+ if isinstance(result, pd.DataFrame):
67
+ return result.head().to_string(index=False)
68
+ return str(result)
69
 
70
  # 3. Core function
71
  def answer_question(history, message):
 
76
  prompt = f"""
77
  You are a data analysis assistant.
78
  You can ONLY answer questions using the two Excel reports provided (df1 and df2).
 
 
 
79
  Do not hallucinate or use external knowledge.
80
+ The reports are loaded as dictionaries of DataFrames:
81
+ - Access Report 1 with df1['SheetName']
82
+ - Access Report 2 with df2['SheetName']
83
+ Do not reload Excel files with pandas.
84
+ If unsure is the question relevant, try to reason using columns available.
85
+ If absolutely no relation to provided sheets, respond with:
86
  "I can only answer questions about the provided Excel reports."
87
 
88
  The reports have the following schema:
 
96
  - Do NOT write import statements (pandas is already imported as pd).
97
  - Always put the answer in a variable named `result`.
98
  - Return ONLY Python code, nothing else.
99
+ - If multiple values are tied for the maximum, include all of them in a list.
100
+ - If result is numeric, round to 2 decimal places.
101
+ - If result is a list, return the full list (not just the first element).
102
+ - If a column is missing, return a clear error string in `result`, do not crash.
103
  """
104
 
105
  try: