mlbench123 committed on
Commit
6ca3aa0
·
verified ·
1 Parent(s): b211b3c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +257 -200
app.py CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
2
  import tempfile
3
  import shutil
4
  from pathlib import Path
 
 
5
 
6
  """
7
  Real Estate Financial Model Pipeline
@@ -44,23 +46,66 @@ class RealEstateModelPipeline:
44
  except Exception as e:
45
  print(f"Error extracting {pdf_path}: {e}")
46
  return ""
47
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def extract_all_pdfs(self, pdf_directory: str) -> Dict[str, str]:
49
- """Extract text from all PDFs in directory"""
50
  pdf_dir = Path(pdf_directory)
51
  extracted_texts = {}
52
 
53
  with open('output_file_3.txt', "w", encoding="utf-8") as f:
 
54
  for pdf_file in pdf_dir.glob("*.pdf"):
55
- print(f"Extracting: {pdf_file.name}")
56
  text = self.extract_pdf_text(str(pdf_file))
57
  extracted_texts[pdf_file.stem] = text
58
-
59
- # Write each PDF’s name and extracted text to file
60
  f.write(f"=== {pdf_file.name} ===\n")
61
  f.write(text)
62
  f.write("\n\n" + "="*80 + "\n\n")
63
-
 
 
 
 
 
 
 
 
 
 
64
  self.extracted_data = extracted_texts
65
 
66
  return extracted_texts
@@ -85,198 +130,206 @@ class RealEstateModelPipeline:
85
 
86
  prompt = f"""You are a real estate financial analyst. Extract ALL numerical data from the following PDF texts and return it as a JSON object.
87
 
88
- CRITICAL INSTRUCTIONS:
89
- 1. ONLY extract data that is EXPLICITLY stated in the PDFs - DO NOT estimate or make up values
90
- 2. For missing values, use null (not 0)
91
- 3. Pay close attention to the specific document names - each contains different information
92
- 4. Extract exact numbers as they appear in the documents
93
-
94
- AVAILABLE DOCUMENTS:
95
- {pdf_summary}
96
-
97
- PDF CONTENTS:
98
- """
99
  for name, text in pdf_texts.items():
100
  prompt += f"\n{'='*60}\n=== {name} ===\n{'='*60}\n{text}\n"
101
 
102
  prompt += """
103
 
104
- EXTRACTION INSTRUCTIONS BY DOCUMENT:
105
-
106
- FROM "Offering_Memorandum.pdf":
107
- - Extract: Address (full address after "Address:")
108
- - Extract: Property Type (after "Property Type:")
109
- - Extract: Units (number after "Units:")
110
-
111
- FROM "Operating_Expenses_Summary.pdf" (if present):
112
- - Extract EXACT annual amounts for:
113
- * Real Estate Taxes
114
- * Insurance
115
- * Utilities
116
- * Repairs & Maint. (or Repairs & Maintenance)
117
- * Management Fee
118
- * Payroll
119
- * Administrative (if listed)
120
- * Professional Fees (if listed)
121
-
122
- FROM "Sales_Comps.pdf":
123
- - Extract all Price/SF values
124
- - Calculate average_price_per_sf = average of all Price/SF values
125
- - Count total number of comps
126
-
127
- FROM "Rent_Comps.pdf" (if present):
128
- - Extract all rent values (numbers before @ symbol)
129
- - Calculate average_rent = average of all rent values
130
- - Count total number of rent comps
131
-
132
- FROM "Market_Report.pdf":
133
- - Extract: Vacancy Rate (percentage)
134
- - Extract: Rent Growth (YoY) (percentage)
135
-
136
- FROM "Demographics_Overview.pdf":
137
- - Extract: Population (3-mi) - the number
138
- - Extract: Median HH Income - the dollar amount
139
- - Extract: Transit Score - the number
140
-
141
- REQUIRED JSON OUTPUT STRUCTURE:
142
- {
143
- "property_info": {
144
- "address": "EXTRACT FROM Offering_Memorandum.pdf",
145
- "property_type": "EXTRACT FROM Offering_Memorandum.pdf",
146
- "units": EXTRACT_NUMBER_FROM_Offering_Memorandum.pdf,
147
- "gross_sf": null,
148
- "rentable_sf": null,
149
- "retail_sf": null
150
- },
151
- "acquisition": {
152
- "land_value": null,
153
- "price": null,
154
- "closing_costs": null
155
- },
156
- "construction": {
157
- "construction_cost_per_gsf": null,
158
- "construction_months": null
159
- },
160
- "soft_costs": {
161
- "architecture_and_interior_cost": null,
162
- "structural_engineering_cost": null,
163
- "mep_engineering_cost": null,
164
- "civil_engineering_cost": null,
165
- "controlled_inspections_cost": null,
166
- "surveying_cost": null,
167
- "utilities_connection_cost": null,
168
- "advertising_and_marketing_cost": null,
169
- "accounting_cost": null,
170
- "monitoring_cost": null,
171
- "ff_and_e_cost": null,
172
- "environmental_consultant_fee": null,
173
- "miscellaneous_consultants_fee": null,
174
- "general_legal_cost": null,
175
- "real_estate_taxes_during_construction": null,
176
- "miscellaneous_admin_cost": null,
177
- "ibr_cost": null,
178
- "project_team_cost": null,
179
- "pem_fees": null,
180
- "bank_fees": null
181
- },
182
- "financing": {
183
- "ltc_ratio": null,
184
- "financing_percentage": null,
185
- "interest_rate_basis_points": null,
186
- "financing_cost": null,
187
- "interest_reserve": null
188
- },
189
- "operating_expenses": {
190
- "payroll": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
191
- "repairs_and_maintenance": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
192
- "utilities": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
193
- "administrative": EXTRACT_FROM_Operating_Expenses_Summary.pdf_OR_null,
194
- "professional_fees": EXTRACT_FROM_Operating_Expenses_Summary.pdf_OR_null,
195
- "insurance": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
196
- "property_taxes": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
197
- "management_fee_percentage": null
198
- },
199
- "revenue": {
200
- "free_market_rent_psf": null,
201
- "affordable_rent_psf": null,
202
- "other_income_per_unit": null,
203
- "vacancy_rate": null,
204
- "retail_rent_psf": null,
205
- "parking_income": null
206
- },
207
- "sales_comps": {
208
- "average_price_per_sf": CALCULATE_AVERAGE_FROM_Sales_Comps.pdf,
209
- "comp_count": COUNT_FROM_Sales_Comps.pdf
210
- },
211
- "rent_comps": {
212
- "average_rent": CALCULATE_AVERAGE_FROM_Rent_Comps.pdf_IF_EXISTS,
213
- "comp_count": COUNT_FROM_Rent_Comps.pdf_IF_EXISTS
214
- },
215
- "market_data": {
216
- "vacancy_rate": EXTRACT_FROM_Market_Report.pdf,
217
- "rent_growth_yoy": EXTRACT_FROM_Market_Report.pdf,
218
- "median_hh_income": EXTRACT_FROM_Demographics_Overview.pdf,
219
- "population_3mi": EXTRACT_FROM_Demographics_Overview.pdf,
220
- "transit_score": EXTRACT_FROM_Demographics_Overview.pdf
221
- },
222
- "projections": {
223
- "lease_up_months": null,
224
- "stabilization_months": null,
225
- "revenue_inflation_rate": null,
226
- "expense_inflation_rate": null,
227
- "hold_period_months": null,
228
- "exit_cap_rate_decimal": null,
229
- "sale_cost_percentage": null
230
- },
231
- "equity_structure": {
232
- "gp_pref_rate": null,
233
- "lp_pref_rate": null,
234
- "promote_percentage": null
235
- }
236
- }
237
-
238
- EXAMPLES OF CORRECT EXTRACTION:
239
-
240
- Example 1 - From your Offering_Memorandum.pdf:
241
- "Address: 455 Atlantic Ave, Brooklyn, NY"
242
- → "address": "455 Atlantic Ave, Brooklyn, NY"
243
-
244
- "Property Type: Retail"
245
- → "property_type": "Retail"
246
-
247
- "Units: 7"
248
- → "units": 7
249
-
250
- Example 2 - From your Operating_Expenses_Summary.pdf:
251
- "Real Estate Taxes $91940.2"
252
- → "property_taxes": 91940.2
253
-
254
- "Insurance $16778.94"
255
- → "insurance": 16778.94
256
-
257
- "Payroll $44948.21"
258
- → "payroll": 44948.21
259
-
260
- Example 3 - From your Sales_Comps.pdf:
261
- "Price/SF" column shows: $880, $919, $673, $894
262
- → "average_price_per_sf": 841.5 (average of these 4 values)
263
- → "comp_count": 4
264
-
265
- Example 4 - From your Market_Report.pdf:
266
- "Vacancy Rate: 5.71%"
267
- → "vacancy_rate": 0.0571
268
-
269
- "Rent Growth (YoY): 4.18%"
270
- → "rent_growth_yoy": 0.0418
271
-
272
- CRITICAL RULES:
273
- 1. Use EXACT numbers from the PDFs - don't round or modify
274
- 2. Convert percentages to decimals (5.71% → 0.0571)
275
- 3. Remove dollar signs and commas from numbers ($91,940.2 → 91940.2)
276
- 4. If a field is not in ANY PDF, use null
277
- 5. Double-check the document name before extracting - make sure you're looking at the right PDF
 
 
278
 
279
- Return ONLY valid JSON with no explanations, comments, or markdown formatting."""
 
 
 
 
 
 
280
 
281
  return prompt
282
 
@@ -1606,9 +1659,9 @@ if __name__ == "__main__":
1606
  with gr.Row():
1607
  with gr.Column(scale=2):
1608
  pdf_input = gr.File(
1609
- label="Upload PDF Files",
1610
  file_count="multiple",
1611
- file_types=[".pdf"],
1612
  type="filepath"
1613
  )
1614
 
@@ -1616,13 +1669,17 @@ if __name__ == "__main__":
1616
 
1617
  with gr.Column(scale=1):
1618
  gr.Markdown("""
 
 
 
 
1619
  ### 📋 Required Documents
1620
- - Offering Memorandum
1621
- - Operating Expenses Summary
1622
- - Sales Comps
1623
- - Rent Comps
1624
- - Market Report
1625
- - Demographics Overview
1626
 
1627
  ### ⚑ Features
1628
  - Automated data extraction
 
2
  import tempfile
3
  import shutil
4
  from pathlib import Path
5
+ import pandas as pd
6
+ from openpyxl import load_workbook
7
 
8
  """
9
  Real Estate Financial Model Pipeline
 
46
  except Exception as e:
47
  print(f"Error extracting {pdf_path}: {e}")
48
  return ""
49
+
50
+ def extract_xlsx_text(self, xlsx_path: str) -> str:
51
+ """Extract text from XLSX using pandas and openpyxl"""
52
+ try:
53
+ extracted_content = []
54
+
55
+ # Try pandas first for data extraction
56
+ try:
57
+ xlsx = pd.ExcelFile(xlsx_path)
58
+ for sheet_name in xlsx.sheet_names:
59
+ df = pd.read_excel(xlsx, sheet_name=sheet_name)
60
+ extracted_content.append(f"=== Sheet: {sheet_name} ===")
61
+ extracted_content.append(df.to_string(index=False))
62
+ extracted_content.append("\n")
63
+ except:
64
+ pass
65
+
66
+ # Also try openpyxl for cell-level data
67
+ try:
68
+ wb = load_workbook(xlsx_path, data_only=True)
69
+ for sheet in wb.worksheets:
70
+ extracted_content.append(f"\n=== Sheet: {sheet.title} (Raw) ===")
71
+ for row in sheet.iter_rows(values_only=True):
72
+ row_text = " | ".join([str(cell) if cell is not None else "" for cell in row])
73
+ if row_text.strip():
74
+ extracted_content.append(row_text)
75
+ except:
76
+ pass
77
+
78
+ return "\n".join(extracted_content)
79
+ except Exception as e:
80
+ print(f"Error extracting {xlsx_path}: {e}")
81
+ return ""
82
+
83
  def extract_all_pdfs(self, pdf_directory: str) -> Dict[str, str]:
84
+ """Extract text from all PDFs and XLSX files in directory"""
85
  pdf_dir = Path(pdf_directory)
86
  extracted_texts = {}
87
 
88
  with open('output_file_3.txt', "w", encoding="utf-8") as f:
89
+ # Process PDFs
90
  for pdf_file in pdf_dir.glob("*.pdf"):
91
+ print(f"Extracting PDF: {pdf_file.name}")
92
  text = self.extract_pdf_text(str(pdf_file))
93
  extracted_texts[pdf_file.stem] = text
94
+
 
95
  f.write(f"=== {pdf_file.name} ===\n")
96
  f.write(text)
97
  f.write("\n\n" + "="*80 + "\n\n")
98
+
99
+ # Process XLSX files
100
+ for xlsx_file in pdf_dir.glob("*.xlsx"):
101
+ print(f"Extracting XLSX: {xlsx_file.name}")
102
+ text = self.extract_xlsx_text(str(xlsx_file))
103
+ extracted_texts[xlsx_file.stem] = text
104
+
105
+ f.write(f"=== {xlsx_file.name} ===\n")
106
+ f.write(text)
107
+ f.write("\n\n" + "="*80 + "\n\n")
108
+
109
  self.extracted_data = extracted_texts
110
 
111
  return extracted_texts
 
130
 
131
  prompt = f"""You are a real estate financial analyst. Extract ALL numerical data from the following PDF texts and return it as a JSON object.
132
 
133
+ CRITICAL INSTRUCTIONS:
134
+ 1. ONLY extract data that is EXPLICITLY stated in the PDFs - DO NOT estimate or make up values
135
+ 2. For missing values, use null (not 0)
136
+ 3. Pay close attention to the specific document names - each contains different information
137
+ 4. Extract exact numbers as they appear in the documents
138
+
139
+ AVAILABLE DOCUMENTS:
140
+ {pdf_summary}
141
+
142
+ PDF CONTENTS:
143
+ """
144
  for name, text in pdf_texts.items():
145
  prompt += f"\n{'='*60}\n=== {name} ===\n{'='*60}\n{text}\n"
146
 
147
  prompt += """
148
 
149
+ EXTRACTION INSTRUCTIONS BY DOCUMENT:
150
+
151
+ FROM "Offering_Memorandum.pdf":
152
+ - Extract: Address (full address after "Address:")
153
+ - Extract: Property Type (after "Property Type:")
154
+ - Extract: Units (number after "Units:")
155
+
156
+ FROM "Operating_Expenses_Summary.pdf" (if present):
157
+ - Extract EXACT annual amounts for:
158
+ * Real Estate Taxes
159
+ * Insurance
160
+ * Utilities
161
+ * Repairs & Maint. (or Repairs & Maintenance)
162
+ * Management Fee
163
+ * Payroll
164
+ * Administrative (if listed)
165
+ * Professional Fees (if listed)
166
+
167
+ FROM "Sales_Comps.pdf":
168
+ - Extract all Price/SF values
169
+ - Calculate average_price_per_sf = average of all Price/SF values
170
+ - Count total number of comps
171
+
172
+ FROM "Rent_Comps.pdf" (if present):
173
+ - Extract all rent values (numbers before @ symbol)
174
+ - Calculate average_rent = average of all rent values
175
+ - Count total number of rent comps
176
+
177
+ FROM "Market_Report.pdf":
178
+ - Extract: Vacancy Rate (percentage)
179
+ - Extract: Rent Growth (YoY) (percentage)
180
+
181
+ FROM "Demographics_Overview.pdf":
182
+ - Extract: Population (3-mi) - the number
183
+ - Extract: Median HH Income - the dollar amount
184
+ - Extract: Transit Score - the number
185
+
186
+ REQUIRED JSON OUTPUT STRUCTURE:
187
+ {
188
+ "property_info": {
189
+ "address": "EXTRACT FROM Offering_Memorandum.pdf",
190
+ "property_type": "EXTRACT FROM Offering_Memorandum.pdf",
191
+ "units": EXTRACT_NUMBER_FROM_Offering_Memorandum.pdf,
192
+ "gross_sf": null,
193
+ "rentable_sf": null,
194
+ "retail_sf": null
195
+ },
196
+ "acquisition": {
197
+ "land_value": null,
198
+ "price": null,
199
+ "closing_costs": null
200
+ },
201
+ "construction": {
202
+ "construction_cost_per_gsf": null,
203
+ "construction_months": null
204
+ },
205
+ "soft_costs": {
206
+ "architecture_and_interior_cost": null,
207
+ "structural_engineering_cost": null,
208
+ "mep_engineering_cost": null,
209
+ "civil_engineering_cost": null,
210
+ "controlled_inspections_cost": null,
211
+ "surveying_cost": null,
212
+ "utilities_connection_cost": null,
213
+ "advertising_and_marketing_cost": null,
214
+ "accounting_cost": null,
215
+ "monitoring_cost": null,
216
+ "ff_and_e_cost": null,
217
+ "environmental_consultant_fee": null,
218
+ "miscellaneous_consultants_fee": null,
219
+ "general_legal_cost": null,
220
+ "real_estate_taxes_during_construction": null,
221
+ "miscellaneous_admin_cost": null,
222
+ "ibr_cost": null,
223
+ "project_team_cost": null,
224
+ "pem_fees": null,
225
+ "bank_fees": null
226
+ },
227
+ "financing": {
228
+ "ltc_ratio": null,
229
+ "financing_percentage": null,
230
+ "interest_rate_basis_points": null,
231
+ "financing_cost": null,
232
+ "interest_reserve": null
233
+ },
234
+ "operating_expenses": {
235
+ "payroll": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
236
+ "repairs_and_maintenance": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
237
+ "utilities": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
238
+ "administrative": EXTRACT_FROM_Operating_Expenses_Summary.pdf_OR_null,
239
+ "professional_fees": EXTRACT_FROM_Operating_Expenses_Summary.pdf_OR_null,
240
+ "insurance": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
241
+ "property_taxes": EXTRACT_FROM_Operating_Expenses_Summary.pdf,
242
+ "management_fee_percentage": null
243
+ },
244
+ "revenue": {
245
+ "free_market_rent_psf": null,
246
+ "affordable_rent_psf": null,
247
+ "other_income_per_unit": null,
248
+ "vacancy_rate": null,
249
+ "retail_rent_psf": null,
250
+ "parking_income": null
251
+ },
252
+ "sales_comps": {
253
+ "average_price_per_sf": CALCULATE_AVERAGE_FROM_Sales_Comps.pdf,
254
+ "comp_count": COUNT_FROM_Sales_Comps.pdf
255
+ },
256
+ "rent_comps": {
257
+ "average_rent": CALCULATE_AVERAGE_FROM_Rent_Comps.pdf_IF_EXISTS,
258
+ "comp_count": COUNT_FROM_Rent_Comps.pdf_IF_EXISTS
259
+ },
260
+ "market_data": {
261
+ "vacancy_rate": EXTRACT_FROM_Market_Report.pdf,
262
+ "rent_growth_yoy": EXTRACT_FROM_Market_Report.pdf,
263
+ "median_hh_income": EXTRACT_FROM_Demographics_Overview.pdf,
264
+ "population_3mi": EXTRACT_FROM_Demographics_Overview.pdf,
265
+ "transit_score": EXTRACT_FROM_Demographics_Overview.pdf
266
+ },
267
+ "projections": {
268
+ "lease_up_months": null,
269
+ "stabilization_months": null,
270
+ "revenue_inflation_rate": null,
271
+ "expense_inflation_rate": null,
272
+ "hold_period_months": null,
273
+ "exit_cap_rate_decimal": null,
274
+ "sale_cost_percentage": null
275
+ },
276
+ "equity_structure": {
277
+ "gp_pref_rate": null,
278
+ "lp_pref_rate": null,
279
+ "promote_percentage": null
280
+ }
281
+ }
282
+
283
+ EXAMPLES OF CORRECT EXTRACTION:
284
+
285
+ Example 1 - From your Offering_Memorandum.pdf:
286
+ "Address: 455 Atlantic Ave, Brooklyn, NY"
287
+ → "address": "455 Atlantic Ave, Brooklyn, NY"
288
+
289
+ "Property Type: Retail"
290
+ → "property_type": "Retail"
291
+
292
+ "Units: 7"
293
+ → "units": 7
294
+
295
+ Example 2 - From your Operating_Expenses_Summary.pdf:
296
+ "Real Estate Taxes $91940.2"
297
+ → "property_taxes": 91940.2
298
+
299
+ "Insurance $16778.94"
300
+ → "insurance": 16778.94
301
+
302
+ "Payroll $44948.21"
303
+ → "payroll": 44948.21
304
+
305
+ Example 3 - From your Sales_Comps.pdf:
306
+ "Price/SF" column shows: $880, $919, $673, $894
307
+ → "average_price_per_sf": 841.5 (average of these 4 values)
308
+ → "comp_count": 4
309
+
310
+ Example 4 - From your Market_Report.pdf:
311
+ "Vacancy Rate: 5.71%"
312
+ → "vacancy_rate": 0.0571
313
+
314
+ "Rent Growth (YoY): 4.18%"
315
+ → "rent_growth_yoy": 0.0418
316
+
317
+ CRITICAL RULES:
318
+ 1. Use EXACT numbers from the PDFs - don't round or modify
319
+ 2. Convert percentages to decimals (5.71% → 0.0571)
320
+ 3. Remove dollar signs and commas from numbers ($91,940.2 → 91940.2)
321
+ 4. If a field is not in ANY PDF, use null
322
+ 5. Double-check the document name before extracting - make sure you're looking at the right PDF
323
+
324
+ Return ONLY valid JSON with no explanations, comments, or markdown formatting."""
325
 
326
+ prompt += """
327
+
328
+ NOTE: Documents may be in PDF or XLSX format. For XLSX files, data is extracted sheet-by-sheet.
329
+ Look for numerical data in tables, columns, and labeled cells.
330
+
331
+ PDF AND XLSX CONTENTS:
332
+ """
333
 
334
  return prompt
335
 
 
1659
  with gr.Row():
1660
  with gr.Column(scale=2):
1661
  pdf_input = gr.File(
1662
+ label="Upload PDF/XLSX Files",
1663
  file_count="multiple",
1664
+ file_types=[".pdf", ".xlsx", ".xls"], # Added .xlsx and .xls
1665
  type="filepath"
1666
  )
1667
 
 
1669
 
1670
  with gr.Column(scale=1):
1671
  gr.Markdown("""
1672
+ ### 📋 Supported Formats
1673
+ - **PDF**: Offering Memorandum, Reports
1674
+ - **XLSX/XLS**: Financial statements, data tables
1675
+
1676
  ### 📋 Required Documents
1677
+ - Offering Memorandum (PDF/XLSX)
1678
+ - Operating Expenses Summary (PDF/XLSX)
1679
+ - Sales Comps (PDF/XLSX)
1680
+ - Rent Comps (PDF/XLSX)
1681
+ - Market Report (PDF/XLSX)
1682
+ - Demographics Overview (PDF/XLSX)
1683
 
1684
  ### ⚑ Features
1685
  - Automated data extraction