Documents-Manager

Sleeping

App Files Files Community

rairo committed on Jul 20, 2025

Commit

b07357c

verified ·

1 Parent(s): 0098187

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -167

app.py CHANGED Viewed

@@ -22,7 +22,6 @@ def configure_gemini(api_key):
     """
     st.info("Configuring Gemini API for transaction extraction...")
     genai.configure(api_key=api_key)
-    # Using the model specified by the user for this task
     return genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def configure_gemini1(api_key):
@@ -31,7 +30,6 @@ def configure_gemini1(api_key):
     """
     st.info("Configuring Gemini API for report generation...")
     genai.configure(api_key=api_key)
-    # Using the state-of-the-art model for high-quality report formatting
     return genai.GenerativeModel('gemini-2.5-pro')
 def read_pdf_pages(file_obj):
@@ -77,12 +75,10 @@ def process_with_gemini(model, text):
     }"""
     try:
         response = model.generate_content([prompt, text])
-        time.sleep(6) # Retaining original sleep time as per user's working code
         return response.text
     except exceptions.GoogleAPICallError as e:
         st.error(f"A Google API call error occurred during transaction extraction: {e}")
-        if "context length" in str(e):
-            st.warning("The text on a single PDF page may be too long for the extraction model.")
         return None
     except Exception as e:
         st.error(f"An unexpected error occurred during Gemini transaction extraction: {e}")
@@ -90,134 +86,81 @@ def process_with_gemini(model, text):
 def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
     all_transactions = []
-    st.info(f"Starting page-by-page PDF processing for {total_pages} pages...")
     for page_num in range(total_pages):
         if progress_callback:
             progress_callback(page_num / total_pages, f"Processing page {page_num + 1} of {total_pages}")
         page_text = extract_page_text(pdf_reader, page_num)
         if not page_text.strip():
             continue
-        st.info(f"Sending page {page_num + 1} text to Gemini for transaction extraction...")
         json_response = process_with_gemini(model, page_text)
         if json_response:
-            # A more robust regex to find the JSON block
             match = re.search(r'\{.*\}', json_response, re.DOTALL)
             if not match:
-                st.warning(f"No valid JSON object found in Gemini response for page {page_num + 1}.")
                 continue
             json_str = match.group(0)
             try:
                 data = json.loads(json_str)
                 transactions = data.get('transactions', [])
                 if transactions:
-                    st.info(f"Successfully extracted {len(transactions)} transactions from page {page_num + 1}.")
                     all_transactions.extend(transactions)
             except json.JSONDecodeError:
-                st.error(f"Failed to decode JSON from Gemini response for page {page_num + 1}.")
                 continue
-        else:
-            st.warning(f"Gemini returned no response for page {page_num + 1}.")
-    st.info(f"Finished processing all pages. Total transactions extracted: {len(all_transactions)}.")
     return all_transactions
 def aggregate_financial_data(transactions: list, statement_type: str):
-    """
-    Aggregates transaction data using Pandas for high performance and accuracy.
-    This function does the heavy lifting locally, preparing a small summary for the LLM.
-    This version includes robust cleaning of the 'Amount' column.
-    """
     st.info(f"Performing local financial aggregation for {len(transactions)} transactions...")
     if not transactions:
-        st.warning("No transactions to aggregate.")
         return None
     df = pd.DataFrame(transactions)
-    # --- Robust Data Cleaning and Preparation ---
     if 'Amount' not in df.columns:
-        st.error("'Amount' column not found in the transaction data. Cannot perform aggregation.")
         return None
-    # 1. Ensure the 'Amount' column is treated as a string to use string operations.
-    df['Amount'] = df['Amount'].astype(str)
-    # 2. Use a regular expression to remove any character that is NOT a digit or a decimal point.
-    # This handles currency symbols, commas, spaces, etc.
-    df['Amount'] = df['Amount'].str.replace(r'[^\d.]', '', regex=True)
-    # 3. Now, it's safe to convert the cleaned string to a numeric type.
-    # Coerce errors will handle any empty strings that might result from the cleaning.
     df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce').fillna(0)
-    # 4. Ensure 'Type' column is standardized to lowercase for consistent filtering.
     df['Type'] = df['Type'].str.lower()
-    # --- Core Financial Calculations ---
     total_income = df[df['Type'] == 'income']['Amount'].sum()
     total_expenses = df[df['Type'] == 'expense']['Amount'].sum()
     net_position = total_income - total_expenses
-    # --- Build the Aggregated Data Structure ---
     aggregated_data = {
         "total_income": round(total_income, 2),
         "total_expenses": round(total_expenses, 2),
         "net_position": round(net_position, 2),
         "transaction_count": len(df)
     }
-    # --- Statement-Specific Aggregations ---
     if statement_type == "Income Statement":
-        expense_breakdown = df[df['Type'] == 'expense'].groupby('Category_of_expense')['Amount'].sum().round(2).to_dict()
-        aggregated_data["expense_breakdown"] = expense_breakdown
-        income_breakdown = df[df['Type'] == 'income'].groupby('Customer_name')['Amount'].sum().round(2).to_dict()
-        aggregated_data["income_breakdown"] = income_breakdown
-    elif statement_type == "Cashflow Statement":
-        aggregated_data["operating_cash_flow"] = round(net_position, 2)
-        aggregated_data["cash_inflows"] = round(total_income, 2)
-        aggregated_data["cash_outflows"] = round(total_expenses, 2)
-    elif statement_type == "Balance Sheet":
-        aggregated_data["notes"] = "Balance Sheets require asset and liability balances, not just transaction flows. This data can only show the net change in cash over the period."
     st.success("Local financial aggregation complete.")
     return aggregated_data
 def generate_financial_report(model, aggregated_data, start_date, end_date, statement_type):
     """
-    Generates a financial report by sending a small, pre-aggregated summary to the LLM.
-    The LLM's job is to format this data professionally, not to calculate it.
     """
     st.info(f"Preparing to generate {statement_type} with pre-aggregated data...")
-    prompt = f"""
-Based on the following pre-aggregated financial summary JSON data:
-{json.dumps(aggregated_data, indent=2)}
-Generate a detailed {statement_type} report for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}. Present the report in a standard accounting format relevant to South Africa, but with improved readability and visual appeal.
-Specific Formatting and Content Requirements:
-Standard Accounting Structure (South Africa Focus): Organize the {statement_type} according to typical accounting practices followed in South Africa (e.g., for an Income Statement, clearly separate Revenue, Cost of Goods Sold, Gross Profit, Operating Expenses, and Net Income, in nice tables considering local terminology where applicable). If unsure of specific local variations, adhere to widely accepted international accounting structures.
-Clear Headings and Subheadings: Use distinct and informative headings and subheadings in English to delineate different sections of the report. Ensure these are visually prominent.
-Consistent Formatting: Maintain consistent formatting for monetary values (using "R" for South African Rand), dates, and alignment.
-Totals and Subtotals: Clearly display totals for relevant categories and subtotals where appropriate.
-Descriptive Line Items: Use the provided aggregated data to create clear line items.
-Key Insights: Include a brief section (e.g., "Key Highlights" or "Summary") that identifies significant trends or key performance indicators derived from the provided summary data.
-Concise Summary: Provide a concluding summary paragraph that encapsulates the overall financial picture.
-Special Case for Balance Sheet: If the request is for a "Balance Sheet," explain professionally that a balance sheet cannot be generated from transaction data alone, as it requires a snapshot of assets, liabilities, and equity. Then, present the available cash flow information as a helpful alternative.
-Format the entire report in Markdown for better visual structure.
-Do not name the company if a name is not there; refer to it as "The Business". Return just the report and nothing else.
 """
     try:
         st.info("Sending request to Gemini for final report formatting...")
         response = model.generate_content([prompt])
-        time.sleep(7) # Retaining original sleep time
         st.success("Successfully received formatted financial report from Gemini.")
         return response.text
     except exceptions.GoogleAPICallError as e:
@@ -227,78 +170,11 @@ Do not name the company if a name is not there; refer to it as "The Business". R
         st.error(f"An unexpected error occurred during Gemini report generation: {e}")
         return None
-# --- PDF Generation Logic (Unaltered as per your request) ---
-class PDF_Generator(FPDF):
-    def add_html_element(self, tag, styles):
-        text = tag.get_text()
-        tag_name = tag.name.lower()
-        current_style = ''
-        if 'b' in styles or 'strong' in styles: current_style += 'B'
-        if 'i' in styles or 'em' in styles: current_style += 'I'
-        if not current_style: self.set_font('helvetica', '', self.font_size_pt)
-        if tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
-            level = int(tag_name[1])
-            font_size = {1: 18, 2: 16, 3: 14, 4: 12, 5: 11, 6: 10}.get(level, 10)
-            self.set_font('helvetica', 'B', font_size)
-            self.multi_cell(0, font_size * 0.5, text, align='L')
-            self.ln(font_size * 0.3)
-            self.set_font('helvetica', '', 10)
-        elif tag_name == 'p':
-            self.set_font('helvetica', current_style, 10)
-            self.multi_cell(0, 5, text, align='L')
-            self.ln(3)
-        elif tag_name == 'ul':
-            self.ln(2)
-            for item in tag.find_all('li', recursive=False):
-                self.set_font('helvetica', '', 10)
-                item_text = item.get_text()
-                self.cell(5, 5, chr(127))
-                self.multi_cell(0, 5, item_text, align='L')
-                self.ln(1)
-            self.ln(3)
-        elif tag_name == 'table':
-            self.ln(5)
-            self.process_table(tag)
-            self.ln(5)
-        elif tag_name == 'br': self.ln(5)
-        elif tag_name == 'hr':
-            self.ln(2)
-            self.line(self.get_x(), self.get_y(), self.w - self.r_margin, self.get_y())
-            self.ln(4)
-        else:
-            if text.strip():
-                self.set_font('helvetica', current_style, 10)
-                self.multi_cell(0, 5, text, align='L')
-                self.ln(1)
-    def process_table(self, table_tag):
-        rows = table_tag.find_all('tr')
-        if not rows: return
-        header_cells = rows[0].find_all(['th', 'td'])
-        num_cols = len(header_cells)
-        if num_cols == 0: return
-        effective_width = self.w - self.l_margin - self.r_margin
-        col_width = effective_width / num_cols
-        default_cell_height = 6
-        is_first_row = True
-        for row in rows:
-            cells = row.find_all(['th', 'td'])
-            if len(cells) != num_cols: continue
-            is_header_row = all(c.name == 'th' for c in cells) or (is_first_row and any(c.name == 'th' for c in cells))
-            for i, cell in enumerate(cells):
-                cell_text = cell.get_text().strip()
-                if is_header_row:
-                    self.set_font('helvetica', 'B', 9)
-                    self.set_fill_color(230, 230, 230)
-                    fill = True
-                else:
-                    self.set_font('helvetica', '', 9)
-                    fill = False
-                self.multi_cell(col_width, default_cell_height, cell_text, border=1, align='L', fill=fill, new_x="RIGHT", new_y="TOP")
-            self.ln(default_cell_height)
-            is_first_row = False
 def create_pdf_report(report_text):
     if not report_text:
         st.warning("Report text is empty, skipping PDF generation.")
         raise ValueError("Input report_text cannot be empty.")
@@ -307,16 +183,69 @@ def create_pdf_report(report_text):
         cleaned_md = re.sub(r'```markdown|```', '', report_text, flags=re.MULTILINE).strip()
         html_content = markdown.markdown(cleaned_md, extensions=['tables'])
         soup = BeautifulSoup(html_content, 'html.parser')
-        pdf = PDF_Generator()
         pdf.set_auto_page_break(auto=True, margin=15)
         pdf.set_left_margin(15)
         pdf.set_right_margin(15)
         pdf.add_page()
-        pdf.set_font('helvetica', '', 10)
-        for element in soup.find_all(recursive=False):
-            pdf.add_html_element(element, set())
         st.info("Content added to PDF. Outputting PDF to buffer...")
-        pdf_output = pdf.output(dest='S').encode('latin-1')
         st.success("PDF report generated successfully.")
         return BytesIO(pdf_output)
     except Exception as e:
@@ -340,7 +269,6 @@ def main():
     if input_type == "Bulk Bank Statement Upload":
         uploaded_files = st.file_uploader("Upload PDF bank statements", type="pdf", accept_multiple_files=True)
         if uploaded_files:
-            st.info(f"User uploaded {len(uploaded_files)} PDF file(s).")
             model = configure_gemini(api_key)
             progress_bar = st.progress(0)
             all_transactions = []
@@ -357,26 +285,20 @@ def main():
     elif input_type == "CSV Upload":
         uploaded_csv = st.file_uploader("Upload CSV of transactions", type="csv")
         if uploaded_csv:
-            st.info(f"User uploaded CSV file: {uploaded_csv.name}.")
             df = pd.read_csv(uploaded_csv)
             df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
             st.session_state['transactions'] = df.to_dict(orient='records')
             st.success(f"Successfully loaded {len(st.session_state['transactions'])} transactions from CSV.")
     if st.session_state['transactions']:
-        st.info("Consolidating and displaying all extracted transactions.")
         df = pd.DataFrame(st.session_state['transactions'])
         df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)
         df.dropna(subset=['Date'], inplace=True)
         if not df.empty:
-            min_date = df['Date'].min().date()
-            max_date = df['Date'].max().date()
-            st.session_state['min_date'] = min_date
-            st.session_state['max_date'] = max_date
         st.write("### Extracted Transactions")
         st.dataframe(df.astype(str))
-    else:
-        st.info("No transactions loaded yet. Upload files to begin.")
     st.write("### Generate Financial Report")
     col1, col2 = st.columns(2)
@@ -384,7 +306,7 @@ def main():
         start_date = st.date_input("Start Date", st.session_state['min_date'])
     with col2:
         end_date = st.date_input("End Date", st.session_state['max_date'])
-    statement_type = st.selectbox("Select Financial Statement", ["Income Statement", "Cashflow Statement", "Balance Sheet"])
     if st.button("Generate Financial Report"):
         if not st.session_state['transactions']:
@@ -418,8 +340,6 @@ def main():
                                 file_name=f"{statement_type.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.pdf",
                                 mime="application/pdf"
                             )
-                        else:
-                            st.error("Failed to generate the financial report from the aggregated data.")
                 except Exception as e:
                     st.error(f"An unexpected error occurred during the report generation process: {e}")
                     st.exception(e)

     """
     st.info("Configuring Gemini API for transaction extraction...")
     genai.configure(api_key=api_key)
     return genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def configure_gemini1(api_key):
     """
     st.info("Configuring Gemini API for report generation...")
     genai.configure(api_key=api_key)
     return genai.GenerativeModel('gemini-2.5-pro')
 def read_pdf_pages(file_obj):
     }"""
     try:
         response = model.generate_content([prompt, text])
+        time.sleep(6)
         return response.text
     except exceptions.GoogleAPICallError as e:
         st.error(f"A Google API call error occurred during transaction extraction: {e}")
         return None
     except Exception as e:
         st.error(f"An unexpected error occurred during Gemini transaction extraction: {e}")
 def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
     """Extract transactions from a PDF one page at a time.

     For each page index in ``range(total_pages)`` the page text is obtained via
     ``extract_page_text`` and passed to ``process_with_gemini``. The first
     brace-delimited span in the reply is parsed as JSON and the entries under
     its ``'transactions'`` key are appended to the running result.

     A page is skipped without any message when its text is blank, when the
     model reply is empty, when the reply contains no ``{...}`` span, or when
     that span is not valid JSON.

     Args:
         model: Passed through unchanged to ``process_with_gemini``.
         pdf_reader: Passed through unchanged to ``extract_page_text``.
         total_pages: Number of pages to iterate over.
         progress_callback: Optional callable invoked before each page with
             ``(fraction_done, message)``; the fraction is
             ``page_index / total_pages``.

     Returns:
         A list with the transactions collected from all pages, in page order.
     """
     collected = []
     for index in range(total_pages):
         if progress_callback:
             progress_callback(index / total_pages, f"Processing page {index + 1} of {total_pages}")

         text = extract_page_text(pdf_reader, index)
         if not text.strip():
             continue

         reply = process_with_gemini(model, text)
         if not reply:
             continue

         # DOTALL lets the span cover a JSON object spread over several lines.
         found = re.search(r'\{.*\}', reply, re.DOTALL)
         if found is None:
             continue

         try:
             payload = json.loads(found.group(0))
         except json.JSONDecodeError:
             continue

         page_items = payload.get('transactions', [])
         if page_items:
             collected.extend(page_items)
     return collected
 def aggregate_financial_data(transactions: list, statement_type: str):
     st.info(f"Performing local financial aggregation for {len(transactions)} transactions...")
     if not transactions:
         return None
     df = pd.DataFrame(transactions)
     if 'Amount' not in df.columns:
         return None
+    df['Amount'] = df['Amount'].astype(str).str.replace(r'[^\d.]', '', regex=True)
     df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce').fillna(0)
     df['Type'] = df['Type'].str.lower()
     total_income = df[df['Type'] == 'income']['Amount'].sum()
     total_expenses = df[df['Type'] == 'expense']['Amount'].sum()
     net_position = total_income - total_expenses
     aggregated_data = {
         "total_income": round(total_income, 2),
         "total_expenses": round(total_expenses, 2),
         "net_position": round(net_position, 2),
         "transaction_count": len(df)
     }
     if statement_type == "Income Statement":
+        aggregated_data["expense_breakdown"] = df[df['Type'] == 'expense'].groupby('Category_of_expense')['Amount'].sum().round(2).to_dict()
+        aggregated_data["income_breakdown"] = df[df['Type'] == 'income'].groupby('Customer_name')['Amount'].sum().round(2).to_dict()
     st.success("Local financial aggregation complete.")
     return aggregated_data
 def generate_financial_report(model, aggregated_data, start_date, end_date, statement_type):
     """
+    Generates a financial report using a simplified, high-level prompt that
+    trusts the model to create the correct structure and avoids using any
+    Markdown characters that could break rendering.
     """
     st.info(f"Preparing to generate {statement_type} with pre-aggregated data...")
+    # This is the final, simplified, high-level prompt with no special characters.
+    prompt = f"""
+You are an expert financial analyst. Your task is to generate a professional Income Statement in Markdown format using the pre-aggregated JSON data provided below.
+JSON Data:
+{json.dumps(aggregated_data, indent=2)}
+Instructions:
+Your response must be a complete financial report in Markdown.
+The main title of the report is "Income Statement".
+The reporting period is from {start_date.strftime('%d %B %Y')} to {end_date.strftime('%d %B %Y')}.
+The currency is South African Rand (ZAR).
+The report must contain sections for Revenue, Operating Expenses, and Net Income or Loss. Each of these sections must be a clear table.
+The report must also include a "Key Highlights" section with bullet points and a final "Summary" paragraph.
+Use the provided JSON data for all financial figures.
+For the Net Income or Loss table, if the net position is negative, display the amount in parentheses.
+Separate the major sections with a horizontal rule.
 """
     try:
         st.info("Sending request to Gemini for final report formatting...")
         response = model.generate_content([prompt])
         st.success("Successfully received formatted financial report from Gemini.")
         return response.text
     except exceptions.GoogleAPICallError as e:
         st.error(f"An unexpected error occurred during Gemini report generation: {e}")
         return None
 def create_pdf_report(report_text):
+    """
+    Creates a PDF from markdown text. Includes the critical fix for the
+    'bytearray' object has no attribute 'encode' error.
+    """
     if not report_text:
         st.warning("Report text is empty, skipping PDF generation.")
         raise ValueError("Input report_text cannot be empty.")
         cleaned_md = re.sub(r'```markdown|```', '', report_text, flags=re.MULTILINE).strip()
         html_content = markdown.markdown(cleaned_md, extensions=['tables'])
         soup = BeautifulSoup(html_content, 'html.parser')
+        pdf = FPDF()
         pdf.set_auto_page_break(auto=True, margin=15)
         pdf.set_left_margin(15)
         pdf.set_right_margin(15)
         pdf.add_page()
+        for element in soup.find_all(True):
+            if element.name in ['h1', 'h2', 'h3']:
+                level = int(element.name[1])
+                font_size = {1: 16, 2: 14, 3: 12}.get(level)
+                pdf.set_font('helvetica', 'B', font_size)
+                pdf.multi_cell(0, 10, element.get_text().strip())
+                pdf.ln(level * 2)
+            elif element.name == 'p':
+                pdf.set_font('helvetica', '', 11)
+                pdf.multi_cell(0, 6, element.get_text().strip())
+                pdf.ln(4)
+            elif element.name == 'i':
+                 pdf.set_font('helvetica', 'I', 11)
+                 pdf.multi_cell(0, 6, element.get_text().strip())
+                 pdf.ln(4)
+            elif element.name == 'hr':
+                pdf.line(pdf.get_x(), pdf.get_y(), pdf.w - pdf.r_margin, pdf.get_y())
+                pdf.ln(5)
+            elif element.name == 'ul':
+                pdf.ln(2)
+                for li in element.find_all('li'):
+                    pdf.set_font('helvetica', '', 11)
+                    pdf.multi_cell(0, 5, f"  •  {li.get_text().strip()}")
+                    pdf.ln(1)
+                pdf.ln(4)
+            elif element.name == 'table':
+                header = [th.get_text().strip() for th in element.find_all('th')]
+                rows = [[td.get_text().strip() for td in tr.find_all('td')] for tr in element.find_all('tr')[1:]]
+                if header:
+                    pdf.set_font('helvetica', 'B', 10)
+                    pdf.set_fill_color(230, 230, 230)
+                    col_widths = [ (pdf.w - pdf.l_margin - pdf.r_margin) * 0.6, (pdf.w - pdf.l_margin - pdf.r_margin) * 0.4 ]
+                    for i, header_text in enumerate(header):
+                        pdf.cell(col_widths[i], 8, header_text, border=1, fill=True, align='C')
+                    pdf.ln()
+                pdf.set_font('helvetica', '', 10)
+                for row in rows:
+                    is_total_row = any('Total' in cell for cell in row)
+                    if is_total_row:
+                        pdf.set_font('helvetica', 'B', 10)
+                    pdf.cell(col_widths[0], 7, row[0], border=1)
+                    pdf.cell(col_widths[1], 7, row[1], border=1, align='R')
+                    pdf.ln()
+                    if is_total_row:
+                        pdf.set_font('helvetica', '', 10)
+                pdf.ln(6)
         st.info("Content added to PDF. Outputting PDF to buffer...")
+        # --- CRITICAL FIX FOR PDF GENERATION ---
+        pdf_output = pdf.output()
         st.success("PDF report generated successfully.")
         return BytesIO(pdf_output)
     except Exception as e:
     if input_type == "Bulk Bank Statement Upload":
         uploaded_files = st.file_uploader("Upload PDF bank statements", type="pdf", accept_multiple_files=True)
         if uploaded_files:
             model = configure_gemini(api_key)
             progress_bar = st.progress(0)
             all_transactions = []
     elif input_type == "CSV Upload":
         uploaded_csv = st.file_uploader("Upload CSV of transactions", type="csv")
         if uploaded_csv:
             df = pd.read_csv(uploaded_csv)
             df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
             st.session_state['transactions'] = df.to_dict(orient='records')
             st.success(f"Successfully loaded {len(st.session_state['transactions'])} transactions from CSV.")
     if st.session_state['transactions']:
         df = pd.DataFrame(st.session_state['transactions'])
         df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)
         df.dropna(subset=['Date'], inplace=True)
         if not df.empty:
+            st.session_state['min_date'] = df['Date'].min().date()
+            st.session_state['max_date'] = df['Date'].max().date()
         st.write("### Extracted Transactions")
         st.dataframe(df.astype(str))
     st.write("### Generate Financial Report")
     col1, col2 = st.columns(2)
         start_date = st.date_input("Start Date", st.session_state['min_date'])
     with col2:
         end_date = st.date_input("End Date", st.session_state['max_date'])
+    statement_type = st.selectbox("Select Financial Statement", ["Income Statement"])
     if st.button("Generate Financial Report"):
         if not st.session_state['transactions']:
                                 file_name=f"{statement_type.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.pdf",
                                 mime="application/pdf"
                             )
                 except Exception as e:
                     st.error(f"An unexpected error occurred during the report generation process: {e}")
                     st.exception(e)