Spaces:

MLBench
/

RealState_OCR

Sleeping

App Files Files Community

mlbench123 committed on Oct 29, 2025

Commit

6ca3aa0

verified ·

1 Parent(s): b211b3c

Update app.py

Browse files

Files changed (1) hide show

app.py +257 -200

app.py CHANGED Viewed

@@ -2,6 +2,8 @@ import gradio as gr
 import tempfile
 import shutil
 from pathlib import Path
 """
 Real Estate Financial Model Pipeline
@@ -44,23 +46,66 @@ class RealEstateModelPipeline:
         except Exception as e:
             print(f"Error extracting {pdf_path}: {e}")
             return ""
     def extract_all_pdfs(self, pdf_directory: str) -> Dict[str, str]:
-        """Extract text from all PDFs in directory"""
         pdf_dir = Path(pdf_directory)
         extracted_texts = {}
         with open('output_file_3.txt', "w", encoding="utf-8") as f:
             for pdf_file in pdf_dir.glob("*.pdf"):
-                print(f"Extracting: {pdf_file.name}")
                 text = self.extract_pdf_text(str(pdf_file))
                 extracted_texts[pdf_file.stem] = text
-                # Write each PDF’s name and extracted text to file
                 f.write(f"=== {pdf_file.name} ===\n")
                 f.write(text)
                 f.write("\n\n" + "="*80 + "\n\n")
         self.extracted_data = extracted_texts
         return extracted_texts
@@ -85,198 +130,206 @@ class RealEstateModelPipeline:
         prompt = f"""You are a real estate financial analyst. Extract ALL numerical data from the following PDF texts and return it as a JSON object.
-    CRITICAL INSTRUCTIONS:
-    1. ONLY extract data that is EXPLICITLY stated in the PDFs - DO NOT estimate or make up values
-    2. For missing values, use null (not 0)
-    3. Pay close attention to the specific document names - each contains different information
-    4. Extract exact numbers as they appear in the documents
-    AVAILABLE DOCUMENTS:
-    {pdf_summary}
-    PDF CONTENTS:
-    """
         for name, text in pdf_texts.items():
             prompt += f"\n{'='*60}\n=== {name} ===\n{'='*60}\n{text}\n"
         prompt += """
-    EXTRACTION INSTRUCTIONS BY DOCUMENT:
-    FROM "Offering_Memorandum.pdf":
-    - Extract: Address (full address after "Address:")
-    - Extract: Property Type (after "Property Type:")
-    - Extract: Units (number after "Units:")
-    FROM "Operating_Expenses_Summary.pdf" (if present):
-    - Extract EXACT annual amounts for:
-    * Real Estate Taxes
-    * Insurance
-    * Utilities
-    * Repairs & Maint. (or Repairs & Maintenance)
-    * Management Fee
-    * Payroll
-    * Administrative (if listed)
-    * Professional Fees (if listed)
-    FROM "Sales_Comps.pdf":
-    - Extract all Price/SF values
-    - Calculate average_price_per_sf = average of all Price/SF values
-    - Count total number of comps
-    FROM "Rent_Comps.pdf" (if present):
-    - Extract all rent values (numbers before @ symbol)
-    - Calculate average_rent = average of all rent values
-    - Count total number of rent comps
-    FROM "Market_Report.pdf":
-    - Extract: Vacancy Rate (percentage)
-    - Extract: Rent Growth (YoY) (percentage)
-    FROM "Demographics_Overview.pdf":
-    - Extract: Population (3-mi) - the number
-    - Extract: Median HH Income - the dollar amount
-    - Extract: Transit Score - the number
-    REQUIRED JSON OUTPUT STRUCTURE:
-    {
-    "property_info": {
-        "address": "EXTRACT FROM Offering_Memorandum.pdf",
-        "property_type": "EXTRACT FROM Offering_Memorandum.pdf",
-        "units": EXTRACT_NUMBER_FROM_Offering_Memorandum.pdf,
-        "gross_sf": null,
-        "rentable_sf": null,
-        "retail_sf": null
-    },
-    "acquisition": {
-        "land_value": null,
-        "price": null,
-        "closing_costs": null
-    },
-    "construction": {
-        "construction_cost_per_gsf": null,
-        "construction_months": null
-    },
-    "soft_costs": {
-        "architecture_and_interior_cost": null,
-        "structural_engineering_cost": null,
-        "mep_engineering_cost": null,
-        "civil_engineering_cost": null,
-        "controlled_inspections_cost": null,
-        "surveying_cost": null,
-        "utilities_connection_cost": null,
-        "advertising_and_marketing_cost": null,
-        "accounting_cost": null,
-        "monitoring_cost": null,
-        "ff_and_e_cost": null,
-        "environmental_consultant_fee": null,
-        "miscellaneous_consultants_fee": null,
-        "general_legal_cost": null,
-        "real_estate_taxes_during_construction": null,
-        "miscellaneous_admin_cost": null,
-        "ibr_cost": null,
-        "project_team_cost": null,
-        "pem_fees": null,
-        "bank_fees": null
-    },
-    "financing": {
-        "ltc_ratio": null,
-        "financing_percentage": null,
-        "interest_rate_basis_points": null,
-        "financing_cost": null,
-        "interest_reserve": null
-    },
-    "operating_expenses": {
-        "payroll": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
-        "repairs_and_maintenance": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
-        "utilities": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
-        "administrative": EXTRACT_FROM_Operating_Expenses_Summary.pdf_OR_null,
-        "professional_fees": EXTRACT_FROM_Operating_Expenses_Summary.pdf_OR_null,
-        "insurance": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
-        "property_taxes": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
-        "management_fee_percentage": null
-    },
-    "revenue": {
-        "free_market_rent_psf": null,
-        "affordable_rent_psf": null,
-        "other_income_per_unit": null,
-        "vacancy_rate": null,
-        "retail_rent_psf": null,
-        "parking_income": null
-    },
-    "sales_comps": {
-        "average_price_per_sf": CALCULATE_AVERAGE_FROM_Sales_Comps.pdf,
-        "comp_count": COUNT_FROM_Sales_Comps.pdf
-    },
-    "rent_comps": {
-        "average_rent": CALCULATE_AVERAGE_FROM_Rent_Comps.pdf_IF_EXISTS,
-        "comp_count": COUNT_FROM_Rent_Comps.pdf_IF_EXISTS
-    },
-    "market_data": {
-        "vacancy_rate": EXTRACT_FROM_Market_Report.pdf,
-        "rent_growth_yoy": EXTRACT_FROM_Market_Report.pdf,
-        "median_hh_income": EXTRACT_FROM_Demographics_Overview.pdf,
-        "population_3mi": EXTRACT_FROM_Demographics_Overview.pdf,
-        "transit_score": EXTRACT_FROM_Demographics_Overview.pdf
-    },
-    "projections": {
-        "lease_up_months": null,
-        "stabilization_months": null,
-        "revenue_inflation_rate": null,
-        "expense_inflation_rate": null,
-        "hold_period_months": null,
-        "exit_cap_rate_decimal": null,
-        "sale_cost_percentage": null
-    },
-    "equity_structure": {
-        "gp_pref_rate": null,
-        "lp_pref_rate": null,
-        "promote_percentage": null
-    }
-    }
-    EXAMPLES OF CORRECT EXTRACTION:
-    Example 1 - From your Offering_Memorandum.pdf:
-    "Address: 455 Atlantic Ave, Brooklyn, NY"
-    → "address": "455 Atlantic Ave, Brooklyn, NY"
-    "Property Type: Retail"
-    → "property_type": "Retail"
-    "Units: 7"
-    → "units": 7
-    Example 2 - From your Operating_Expenses_Summary.pdf:
-    "Real Estate Taxes    $91940.2"
-    → "property_taxes": 91940.2
-    "Insurance    $16778.94"
-    → "insurance": 16778.94
-    "Payroll    $44948.21"
-    → "payroll": 44948.21
-    Example 3 - From your Sales_Comps.pdf:
-    "Price/SF" column shows: $880, $919, $673, $894
-    → "average_price_per_sf": 841.5 (average of these 4 values)
-    → "comp_count": 4
-    Example 4 - From your Market_Report.pdf:
-    "Vacancy Rate: 5.71%"
-    → "vacancy_rate": 0.0571
-    "Rent Growth (YoY): 4.18%"
-    → "rent_growth_yoy": 0.0418
-    CRITICAL RULES:
-    1. Use EXACT numbers from the PDFs - don't round or modify
-    2. Convert percentages to decimals (5.71% → 0.0571)
-    3. Remove dollar signs and commas from numbers ($91,940.2 → 91940.2)
-    4. If a field is not in ANY PDF, use null
-    5. Double-check the document name before extracting - make sure you're looking at the right PDF
-    Return ONLY valid JSON with no explanations, comments, or markdown formatting."""
         return prompt
@@ -1606,9 +1659,9 @@ if __name__ == "__main__":
         with gr.Row():
             with gr.Column(scale=2):
                 pdf_input = gr.File(
-                    label="Upload PDF Files",
                     file_count="multiple",
-                    file_types=[".pdf"],
                     type="filepath"
                 )
@@ -1616,13 +1669,17 @@ if __name__ == "__main__":
             with gr.Column(scale=1):
                 gr.Markdown("""
                 ### 📋 Required Documents
-                - Offering Memorandum
-                - Operating Expenses Summary
-                - Sales Comps
-                - Rent Comps
-                - Market Report
-                - Demographics Overview
                 ### ⚡ Features
                 - Automated data extraction

 import tempfile
 import shutil
 from pathlib import Path
+import pandas as pd
+from openpyxl import load_workbook
 """
 Real Estate Financial Model Pipeline
         except Exception as e:
             print(f"Error extracting {pdf_path}: {e}")
             return ""
+    def extract_xlsx_text(self, xlsx_path: str) -> str:
+        """Extract the contents of an Excel workbook as plain text.
+
+        Two independent, best-effort passes are made and their output is
+        concatenated: a pandas pass that renders every sheet as a table, and
+        an openpyxl pass that dumps every non-blank row cell by cell. A
+        failure in one pass is reported and does not stop the other pass.
+
+        Args:
+            xlsx_path: Path to the workbook file.
+
+        Returns:
+            The extracted text, or "" when neither pass produced anything.
+        """
+        extracted_content = []
+        # Pass 1: pandas, one rendered table per sheet.
+        try:
+            # The context manager closes the underlying file handle.
+            with pd.ExcelFile(xlsx_path) as xlsx:
+                for sheet_name in xlsx.sheet_names:
+                    df = pd.read_excel(xlsx, sheet_name=sheet_name)
+                    extracted_content.append(f"=== Sheet: {sheet_name} ===")
+                    extracted_content.append(df.to_string(index=False))
+                    extracted_content.append("\n")
+        except Exception as e:
+            # Best effort: report the failure instead of hiding it, then
+            # fall through to the openpyxl pass.
+            print(f"Error extracting {xlsx_path} with pandas: {e}")
+        # Pass 2: openpyxl, raw cell values (data_only=True reads cached
+        # formula results rather than the formulas themselves).
+        try:
+            wb = load_workbook(xlsx_path, data_only=True)
+            try:
+                for sheet in wb.worksheets:
+                    extracted_content.append(f"\n=== Sheet: {sheet.title} (Raw) ===")
+                    for row in sheet.iter_rows(values_only=True):
+                        cells = ["" if cell is None else str(cell) for cell in row]
+                        # Test the cells, not the joined string: joining
+                        # blank cells still yields " | " separators, which
+                        # would make every blank row look non-empty.
+                        if any(text.strip() for text in cells):
+                            extracted_content.append(" | ".join(cells))
+            finally:
+                wb.close()
+        except Exception as e:
+            print(f"Error extracting {xlsx_path} with openpyxl: {e}")
+        return "\n".join(extracted_content)
     def extract_all_pdfs(self, pdf_directory: str) -> Dict[str, str]:
+        """Extract text from every PDF and XLSX file directly inside a directory.
+
+        Each file's text is stored under the file's stem (name without
+        extension), also written to the dump file 'output_file_3.txt', and the
+        resulting mapping is kept on ``self.extracted_data``.
+
+        Args:
+            pdf_directory: Directory to scan (non-recursively) for "*.pdf" and
+                "*.xlsx" files.
+
+        Returns:
+            Mapping of file stem to extracted text.
+        """
         pdf_dir = Path(pdf_directory)
         extracted_texts = {}
+        # Opened in "w" mode with a relative path: the dump is recreated in the
+        # current working directory on every call.
         with open('output_file_3.txt', "w", encoding="utf-8") as f:
+            # Process PDFs
             for pdf_file in pdf_dir.glob("*.pdf"):
+                print(f"Extracting PDF: {pdf_file.name}")
                 text = self.extract_pdf_text(str(pdf_file))
                 extracted_texts[pdf_file.stem] = text
                 f.write(f"=== {pdf_file.name} ===\n")
                 f.write(text)
                 f.write("\n\n" + "="*80 + "\n\n")
+            # Process XLSX files
+            # NOTE(review): only "*.xlsx" is matched here, while the upload
+            # widget also accepts ".xls" -- such files would be skipped; confirm
+            # whether that is intended.
+            for xlsx_file in pdf_dir.glob("*.xlsx"):
+                print(f"Extracting XLSX: {xlsx_file.name}")
+                text = self.extract_xlsx_text(str(xlsx_file))
+                # NOTE(review): keys are stems, so "Name.xlsx" replaces the
+                # entry already stored for "Name.pdf" when both exist.
+                extracted_texts[xlsx_file.stem] = text
+                f.write(f"=== {xlsx_file.name} ===\n")
+                f.write(text)
+                f.write("\n\n" + "="*80 + "\n\n")
         self.extracted_data = extracted_texts
         return extracted_texts
         prompt = f"""You are a real estate financial analyst. Extract ALL numerical data from the following PDF texts and return it as a JSON object.
+        CRITICAL INSTRUCTIONS:
+        1. ONLY extract data that is EXPLICITLY stated in the PDFs - DO NOT estimate or make up values
+        2. For missing values, use null (not 0)
+        3. Pay close attention to the specific document names - each contains different information
+        4. Extract exact numbers as they appear in the documents
+        AVAILABLE DOCUMENTS:
+        {pdf_summary}
+        PDF CONTENTS:
+        """
         for name, text in pdf_texts.items():
             prompt += f"\n{'='*60}\n=== {name} ===\n{'='*60}\n{text}\n"
         prompt += """
+        EXTRACTION INSTRUCTIONS BY DOCUMENT:
+        FROM "Offering_Memorandum.pdf":
+        - Extract: Address (full address after "Address:")
+        - Extract: Property Type (after "Property Type:")
+        - Extract: Units (number after "Units:")
+        FROM "Operating_Expenses_Summary.pdf" (if present):
+        - Extract EXACT annual amounts for:
+        * Real Estate Taxes
+        * Insurance
+        * Utilities
+        * Repairs & Maint. (or Repairs & Maintenance)
+        * Management Fee
+        * Payroll
+        * Administrative (if listed)
+        * Professional Fees (if listed)
+        FROM "Sales_Comps.pdf":
+        - Extract all Price/SF values
+        - Calculate average_price_per_sf = average of all Price/SF values
+        - Count total number of comps
+        FROM "Rent_Comps.pdf" (if present):
+        - Extract all rent values (numbers before @ symbol)
+        - Calculate average_rent = average of all rent values
+        - Count total number of rent comps
+        FROM "Market_Report.pdf":
+        - Extract: Vacancy Rate (percentage)
+        - Extract: Rent Growth (YoY) (percentage)
+        FROM "Demographics_Overview.pdf":
+        - Extract: Population (3-mi) - the number
+        - Extract: Median HH Income - the dollar amount
+        - Extract: Transit Score - the number
+        REQUIRED JSON OUTPUT STRUCTURE:
+        {
+        "property_info": {
+            "address": "EXTRACT FROM Offering_Memorandum.pdf",
+            "property_type": "EXTRACT FROM Offering_Memorandum.pdf",
+            "units": EXTRACT_NUMBER_FROM_Offering_Memorandum.pdf,
+            "gross_sf": null,
+            "rentable_sf": null,
+            "retail_sf": null
+        },
+        "acquisition": {
+            "land_value": null,
+            "price": null,
+            "closing_costs": null
+        },
+        "construction": {
+            "construction_cost_per_gsf": null,
+            "construction_months": null
+        },
+        "soft_costs": {
+            "architecture_and_interior_cost": null,
+            "structural_engineering_cost": null,
+            "mep_engineering_cost": null,
+            "civil_engineering_cost": null,
+            "controlled_inspections_cost": null,
+            "surveying_cost": null,
+            "utilities_connection_cost": null,
+            "advertising_and_marketing_cost": null,
+            "accounting_cost": null,
+            "monitoring_cost": null,
+            "ff_and_e_cost": null,
+            "environmental_consultant_fee": null,
+            "miscellaneous_consultants_fee": null,
+            "general_legal_cost": null,
+            "real_estate_taxes_during_construction": null,
+            "miscellaneous_admin_cost": null,
+            "ibr_cost": null,
+            "project_team_cost": null,
+            "pem_fees": null,
+            "bank_fees": null
+        },
+        "financing": {
+            "ltc_ratio": null,
+            "financing_percentage": null,
+            "interest_rate_basis_points": null,
+            "financing_cost": null,
+            "interest_reserve": null
+        },
+        "operating_expenses": {
+            "payroll": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
+            "repairs_and_maintenance": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
+            "utilities": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
+            "administrative": EXTRACT_FROM_Operating_Expenses_Summary.pdf_OR_null,
+            "professional_fees": EXTRACT_FROM_Operating_Expenses_Summary.pdf_OR_null,
+            "insurance": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
+            "property_taxes": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
+            "management_fee_percentage": null
+        },
+        "revenue": {
+            "free_market_rent_psf": null,
+            "affordable_rent_psf": null,
+            "other_income_per_unit": null,
+            "vacancy_rate": null,
+            "retail_rent_psf": null,
+            "parking_income": null
+        },
+        "sales_comps": {
+            "average_price_per_sf": CALCULATE_AVERAGE_FROM_Sales_Comps.pdf,
+            "comp_count": COUNT_FROM_Sales_Comps.pdf
+        },
+        "rent_comps": {
+            "average_rent": CALCULATE_AVERAGE_FROM_Rent_Comps.pdf_IF_EXISTS,
+            "comp_count": COUNT_FROM_Rent_Comps.pdf_IF_EXISTS
+        },
+        "market_data": {
+            "vacancy_rate": EXTRACT_FROM_Market_Report.pdf,
+            "rent_growth_yoy": EXTRACT_FROM_Market_Report.pdf,
+            "median_hh_income": EXTRACT_FROM_Demographics_Overview.pdf,
+            "population_3mi": EXTRACT_FROM_Demographics_Overview.pdf,
+            "transit_score": EXTRACT_FROM_Demographics_Overview.pdf
+        },
+        "projections": {
+            "lease_up_months": null,
+            "stabilization_months": null,
+            "revenue_inflation_rate": null,
+            "expense_inflation_rate": null,
+            "hold_period_months": null,
+            "exit_cap_rate_decimal": null,
+            "sale_cost_percentage": null
+        },
+        "equity_structure": {
+            "gp_pref_rate": null,
+            "lp_pref_rate": null,
+            "promote_percentage": null
+        }
+        }
+        EXAMPLES OF CORRECT EXTRACTION:
+        Example 1 - From your Offering_Memorandum.pdf:
+        "Address: 455 Atlantic Ave, Brooklyn, NY"
+        → "address": "455 Atlantic Ave, Brooklyn, NY"
+        "Property Type: Retail"
+        → "property_type": "Retail"
+        "Units: 7"
+        → "units": 7
+        Example 2 - From your Operating_Expenses_Summary.pdf:
+        "Real Estate Taxes    $91940.2"
+        → "property_taxes": 91940.2
+        "Insurance    $16778.94"
+        → "insurance": 16778.94
+        "Payroll    $44948.21"
+        → "payroll": 44948.21
+        Example 3 - From your Sales_Comps.pdf:
+        "Price/SF" column shows: $880, $919, $673, $894
+        → "average_price_per_sf": 841.5 (average of these 4 values)
+        → "comp_count": 4
+        Example 4 - From your Market_Report.pdf:
+        "Vacancy Rate: 5.71%"
+        → "vacancy_rate": 0.0571
+        "Rent Growth (YoY): 4.18%"
+        → "rent_growth_yoy": 0.0418
+        CRITICAL RULES:
+        1. Use EXACT numbers from the PDFs - don't round or modify
+        2. Convert percentages to decimals (5.71% → 0.0571)
+        3. Remove dollar signs and commas from numbers ($91,940.2 → 91940.2)
+        4. If a field is not in ANY PDF, use null
+        5. Double-check the document name before extracting - make sure you're looking at the right PDF
+        Return ONLY valid JSON with no explanations, comments, or markdown formatting."""
+        prompt += """
+        NOTE: Documents may be in PDF or XLSX format. For XLSX files, data is extracted sheet-by-sheet.
+        Look for numerical data in tables, columns, and labeled cells.
+        PDF AND XLSX CONTENTS:
+        """
         return prompt
         with gr.Row():
             with gr.Column(scale=2):
                 pdf_input = gr.File(
+                    label="Upload PDF/XLSX Files",
                     file_count="multiple",
+                    file_types=[".pdf", ".xlsx", ".xls"],  # Added .xlsx and .xls
                     type="filepath"
                 )
             with gr.Column(scale=1):
                 gr.Markdown("""
+                ### 📋 Supported Formats
+                - **PDF**: Offering Memorandum, Reports
+                - **XLSX/XLS**: Financial statements, data tables
                 ### 📋 Required Documents
+                - Offering Memorandum (PDF/XLSX)
+                - Operating Expenses Summary (PDF/XLSX)
+                - Sales Comps (PDF/XLSX)
+                - Rent Comps (PDF/XLSX)
+                - Market Report (PDF/XLSX)
+                - Demographics Overview (PDF/XLSX)
                 ### ⚡ Features
                 - Automated data extraction