Spaces:

k-pavlo
/

excel-ai-analyzer

Sleeping

App Files Files Community

Sw1ft0 committed on Sep 13

Commit

7f64dba

1 Parent(s): 288f777

Add special header case handling to data loading section. Append schema with example rows to be passed to Gemini for context. Improve the prompt and format_result function.

Browse files

Files changed (1) hide show

app.py +48 -13

app.py CHANGED Viewed

@@ -9,18 +9,42 @@ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
 model = genai.GenerativeModel("gemini-2.5-pro")
 # 2. Load Excel data
 # sheet_name=None loads every sheet of a workbook, so df1 and df2 are each a
 # dict of DataFrames keyed by sheet name (they are iterated with .items() below).
-df1 = pd.read_excel(os.path.join("data_source", "OC Onboarding Information.xlsx"), sheet_name=None)
 df2 = pd.read_excel(os.path.join("data_source", "The Alex Ideas Report.xlsx"), sheet_name=None)
 # Build schema info for prompts
 def get_schema_info():
     # Returns one newline-joined string describing both workbooks: for each
     # report a title line and a "Sheet Name: [Sheet Columns]" legend, followed
     # by one "<sheet>: [<column names>]" line per sheet. Reads the module-level
     # df1 / df2 dicts; takes no arguments.
-    schema1_lines = ["Report 1 - OC Onboarding Information", "Sheet Name: [Sheet Columns]"]
     for sheet, df in df1.items():
-        schema1_lines.append(f"{sheet}: {list(df.columns)}")
-    schema2_lines = ["Report 2 - The Alex Ideas Report", "Sheet Name: [Sheet Columns]"]
     for sheet, df in df2.items():
-        schema2_lines.append(f"{sheet}: {list(df.columns)}")
     # Report 1 lines come first, then Report 2 lines.
-    return "\n".join(schema1_lines + schema2_lines)
 schema_info = get_schema_info()
@@ -28,14 +52,20 @@ schema_info = get_schema_info()
 def format_result(result):
     # Normalise a computed `result` for display. numpy scalars are unwrapped to
     # Python scalars, dicts become "key: value" lines (values formatted
     # recursively), lists become comma-separated text, and any other value is
     # returned unchanged.
     # Convert numpy scalars
     if isinstance(result, np.generic):
-        return result.item()
     # Convert dicts into readable strings
     if isinstance(result, dict):
         return "\n".join([f"{k}: {format_result(v)}" for k, v in result.items()])
     # Convert lists into comma-separated string
     if isinstance(result, list):
         # Elements are passed through str() directly, not through format_result.
         return ", ".join(map(str, result))
-    return result
 # 3. Core function
 def answer_question(history, message):
@@ -46,11 +76,13 @@ def answer_question(history, message):
     prompt = f"""
 You are a data analysis assistant.
 You can ONLY answer questions using the two Excel reports provided (df1 and df2).
-The reports are loaded as Python dictionaries of DataFrames:
-- df1['SheetName']['ColumnName'] for Report 1
-- df2['SheetName']['ColumnName'] for Report 2
 Do not hallucinate or use external knowledge.
-If the question is irrelevant, respond with:
 "I can only answer questions about the provided Excel reports."
 The reports have the following schema:
@@ -64,7 +96,10 @@ Rules:
 - Do NOT write import statements (pandas is already imported as pd).
 - Always put the answer in a variable named `result`.
 - Return ONLY Python code, nothing else.
-- If multiple values are tied for the maximum, include all of them.
     """
     try:

 model = genai.GenerativeModel("gemini-2.5-pro")
 # 2. Load Excel data
 # sheet_name=None loads every sheet, giving a dict of DataFrames keyed by sheet name.
 df2 = pd.read_excel(os.path.join("data_source", "The Alex Ideas Report.xlsx"), sheet_name=None)
 # Report 1 needs per-sheet header handling: some sheets keep their real column
 # names on a later row. Map sheet name -> zero-based header row index; every
 # sheet not listed here uses the first row (pandas' default, header=0).
 _OC_REPORT_PATH = os.path.join("data_source", "OC Onboarding Information.xlsx")
 _HEADER_ROW_OVERRIDES = {
     "PY Event Diary": 1,  # use row 2 as header
 }
 # Open the workbook once and parse each sheet from the same handle, instead of
 # loading the whole file raw just to list sheet names and then re-opening it
 # for every sheet.
 with pd.ExcelFile(_OC_REPORT_PATH) as _workbook:
     df1 = {
         sheet: _workbook.parse(sheet, header=_HEADER_ROW_OVERRIDES.get(sheet, 0))
         for sheet in _workbook.sheet_names
     }
 # Build schema info for prompts
 def get_schema_info(report1=None, report2=None):
     """Build the schema description text for both Excel reports.

     For every sheet the output contains a ``Sheet: <name>, Columns: [...]``
     line followed by an ``Example row: ...`` line showing the first data row
     as a dict.

     Args:
         report1: Mapping of sheet name -> DataFrame for Report 1. Defaults to
             the module-level ``df1``.
         report2: Mapping of sheet name -> DataFrame for Report 2. Defaults to
             the module-level ``df2``.

     Returns:
         A single newline-joined string; a blank line separates the two reports.
     """
     if report1 is None:
         report1 = df1
     if report2 is None:
         report2 = df2

     def describe(sheets):
         # One "Sheet/Columns" line and one "Example row" line per sheet.
         described = []
         for sheet, df in sheets.items():
             described.append(f"Sheet: {sheet}, Columns: {list(df.columns)}")
             records = df.head(1).to_dict(orient="records")
             # A sheet without data rows yields no records; indexing [0]
             # unconditionally would raise IndexError while the app starts up.
             sample = records[0] if records else "(no data rows)"
             described.append(f"Example row: {sample}")
         return described

     lines = ["Report 1 - OC Onboarding Information:"]
     lines.extend(describe(report1))
     # The leading "\n" produces the blank separator line once joined.
     lines.append("\nReport 2 - The Alex Ideas Report:")
     lines.extend(describe(report2))
     return "\n".join(lines)
 schema_info = get_schema_info()
 def format_result(result):
     """Convert a computed ``result`` into a display-friendly value.

     Args:
         result: Any value: Python or numpy scalar, dict, list, pandas
             Series/DataFrame, or an arbitrary object.

     Returns:
         - int/float (including numpy numeric scalars): rounded to 2 decimals.
         - bool (including numpy bool): its string form, ``"True"``/``"False"``.
         - dict: ``"key: value"`` lines, values formatted recursively.
         - list: comma-separated string, elements formatted recursively.
         - Series: ``to_string()`` output.
         - DataFrame: the first five rows via ``head().to_string(index=False)``.
         - anything else: ``str(result)``.
     """
     # bool is a subclass of int; handle it before the numeric branch so that
     # True/False are not rounded into 1/0.
     if isinstance(result, bool):
         return str(result)
     # Unwrap numpy scalars and re-dispatch on the Python value. Rounding the
     # unwrapped value directly would raise TypeError for non-numeric scalars
     # (e.g. numpy strings or complex numbers).
     if isinstance(result, np.generic):
         unwrapped = result.item()
         if isinstance(unwrapped, np.generic):
             # No native Python equivalent; stop here to avoid recursing forever.
             return str(unwrapped)
         return format_result(unwrapped)
     if isinstance(result, (int, float)):
         return round(result, 2)
     # Convert dicts into readable strings
     if isinstance(result, dict):
         return "\n".join(f"{k}: {format_result(v)}" for k, v in result.items())
     # Convert lists into comma-separated string; elements get the same
     # formatting (rounding, numpy unwrapping) as dict values.
     if isinstance(result, list):
         return ", ".join(str(format_result(item)) for item in result)
     if isinstance(result, pd.Series):
         return result.to_string()
     if isinstance(result, pd.DataFrame):
         # Only the first rows (DataFrame.head() default) are rendered.
         return result.head().to_string(index=False)
     return str(result)
 # 3. Core function
 def answer_question(history, message):
     prompt = f"""
 You are a data analysis assistant.
 You can ONLY answer questions using the two Excel reports provided (df1 and df2).
 Do not hallucinate or use external knowledge.
+The reports are loaded as dictionaries of DataFrames:
+- Access Report 1 with df1['SheetName']
+- Access Report 2 with df2['SheetName']
+Do not reload Excel files with pandas.
+If unsure is the question relevant, try to reason using columns available.
+If absolutely no relation to provided sheets, respond with:
 "I can only answer questions about the provided Excel reports."
 The reports have the following schema:
 - Do NOT write import statements (pandas is already imported as pd).
 - Always put the answer in a variable named `result`.
 - Return ONLY Python code, nothing else.
+- If multiple values are tied for the maximum, include all of them in a list.
+- If result is numeric, round to 2 decimal places.
+- If result is a list, return the full list (not just the first element).
+- If a column is missing, return a clear error string in `result`, do not crash.
     """
     try: