mlbench123 committed on
Commit
f571da5
·
verified ·
1 Parent(s): 6b302fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +445 -0
app.py CHANGED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import PyPDF2
3
+ import re
4
+ import json
5
+ import io
6
+ from typing import Dict, List, Tuple, Any
7
+ import traceback
8
+
9
class PropertyFormulaAnalyzer:
    """Extract property metrics from documents and evaluate spreadsheet formulas.

    Formulas are read from a text file whose entries look like
    ``<number>. <cell> (<description>) = <expression>``. Uploaded PDF/TXT
    documents are scanned with regular expressions for known metrics, and
    every formula whose variables are all available is evaluated.
    """

    # Names that are callable inside a formula and therefore never variables.
    _FUNCTION_NAMES = frozenset(
        {'SUM', 'PV', 'MIN', 'MAX', 'AVERAGE', 'IF', 'AND', 'OR'}
    )

    # Ranges larger than this are left unexpanded (and fail at evaluation)
    # instead of materialising a huge cell list.
    _MAX_RANGE_CELLS = 10_000

    # One formula entry. The lookahead ends the expression either at the end
    # of the line or right before the next "<n>. <cell> (" header, so decimal
    # literals such as "0.9" no longer terminate the expression early.
    _FORMULA_RE = re.compile(
        r'(\d+)\.\s+([A-Z]+\d+)\s*\(([^)]+)\)\s*=\s*([^=\n]+?)'
        r'(?=\s+\d+\.\s+[A-Z]+\d+\s*\(|\s*$)',
        re.MULTILINE,
    )

    # A whole cell reference ($D$2, D$2, D2) or a whole identifier. The
    # look-arounds stop matches inside numbers ("1e5") or longer names.
    _TOKEN_RE = re.compile(
        r'(?<![\w$.])(\$?[A-Z]+\$?\d+|[A-Za-z_]\w*)(?![\w$])'
    )

    # A rectangular range such as C4:C6 or $A$1:$B$3.
    _RANGE_RE = re.compile(
        r'(?<![\w$.])\$?([A-Z]+)\$?(\d+)\s*:\s*\$?([A-Z]+)\$?(\d+)(?![\w$])'
    )

    # Metric name -> regexes tried in order; group 1 captures the number.
    _EXTRACTION_PATTERNS = {
        # Basic property info
        'UNITS': [r'(?:Total\s+)?Units?\s*:?\s*(\d+)', r'Units\s*(\d+)'],
        'BUILDING_SF': [r'Building\s+(?:Size|SF)\s*:?\s*([\d,]+)', r'Building\s+(?:Size|SF)\s*(\d+)'],
        'LOT_ACRES': [r'Lot\s+Size\s*:?\s*([\d.]+)\s*(?:acres?|Acres?)', r'Lot:\s*([\d.]+)\s*acres?'],
        'LOT_SF': [r'Lot\s+(?:Size\s+)?SF\s*:?\s*([\d,]+)'],

        # Financial metrics. "Price per Unit" is its own metric: storing it
        # as PRICE made every value derived from PRICE wrong.
        'PRICE': [r'(?:Asking\s+)?Price\s*:?\s*\$\s*([\d,]+)'],
        'PRICE_PER_UNIT': [r'Price\s+per\s+Unit\s*:?\s*\$\s*([\d,]+)'],
        'NOI': [r'Net\s+Operating\s+Income\s*(?:\(NOI\))?\s*:?\s*\$?\s*([\d,]+)', r'NOI\s*:?\s*\$?\s*([\d,]+)'],
        'EGI': [r'Effective\s+Gross\s+Income\s*:?\s*\$?\s*([\d,]+)', r'EGI\s*:?\s*\$?\s*([\d,]+)'],
        'GPR': [r'Gross\s+Potential\s+Rent\s*(?:\(Annual\))?\s*:?\s*\$?\s*([\d,]+)', r'GPR\s*:?\s*\$?\s*([\d,]+)'],
        'OPEX': [r'Operating\s+Expenses\s*:?\s*\$?\s*([\d,]+)', r'Total\s+Operating\s+Expenses\s*=?\s*\$?\s*([\d,]+)'],
        'VACANCY': [r'Vacancy\s*(?:\([\d.]+%\))?\s*:?\s*-?\$?\s*([\d,]+)'],

        # Operating expenses categories
        'PROPERTY_TAXES': [r'Property\s+Taxes\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
        'INSURANCE': [r'Insurance\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
        'UTILITIES': [r'Utilities\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
        'REPAIRS_MAINTENANCE': [r'Repairs?\s*(?:&|and)?\s*Maintenance\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
        'PAYROLL': [r'Payroll\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
        'ADMINISTRATIVE': [r'Administrative\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
        'MARKETING': [r'Marketing\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
        'REPLACEMENT_RESERVES': [r'Replacement\s+Reserves\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
        'MANAGEMENT_FEE': [r'Management\s*(?:\([^)]+\))?\s*:?\s*\$?\s*([\d,]+\.?\d*)'],

        # Rates and percentages
        'CAP_RATE': [r'Cap\s+Rate\s*:?\s*([\d.]+)%?', r'Cap\s+Rate\s+([\d.]+)'],
        'INTEREST_RATE': [r'Interest\s+Rate\s*:?\s*([\d.]+)%?'],
        'LTC': [r'Loan[- ]to[- ]Cost\s*(?:\(LTC\))?\s*:?\s*([\d.]+)%?'],
        'EXIT_CAP_RATE': [r'Exit\s+Cap\s+Rate\s*:?\s*([\d.]+)%?'],

        # Demographics
        'MEDIAN_INCOME': [r'Median\s+(?:HH\s+)?Income\s*:?\s*\$?\s*([\d,]+)', r'Median\s+(?:Household\s+)?Income:\s*\$?\s*([\d,]+)'],
        'POPULATION': [r'Population\s*:?\s*([\d,]+)'],
        'HOUSEHOLDS': [r'Households\s*:?\s*([\d,]+)'],
        'RENTER_OCCUPIED_PCT': [r'Renter[- ]Occupied\s*:?\s*([\d.]+)%?'],

        # Construction & Development
        'CONSTRUCTION_GMP': [r'(?:Total\s+)?Construction\s+GMP\s*:?\s*\$?\s*([\d,]+)'],
        'SOFT_COSTS': [r'(?:Total\s+)?Soft\s+Costs?\s*:?\s*\$?\s*([\d,]+)'],
        'CONTINGENCY': [r'Contingency\s*:?\s*\$?\s*([\d,]+)'],
        'DEV_FEE': [r'Dev(?:elopment)?\s+Fee\s*:?\s*\$?\s*([\d,]+)'],

        # Land & Acquisition
        'LAND_VALUE': [r'(?:Total\s+)?Land\s+Value\s*:?\s*\$?\s*([\d,]+)'],
        'CLOSING_COSTS': [r'Closing\s+Costs\s*:?\s*\$?\s*([\d,]+)'],
        'ACQ_FEE': [r'Acq(?:uisition)?\s+Fee\s*:?\s*\$?\s*([\d,]+)'],
    }

    # Metric -> spreadsheet cells that hold it (applied before the E2 default).
    _SIZE_CELL_ALIASES = (
        ('BUILDING_SF', ('D2', 'D$2', '$D$2')),
        ('UNITS', ('F2', 'F$2', '$F$2')),
    )

    # Metric -> spreadsheet cells that hold it (applied after the E2 default).
    _FINANCIAL_CELL_ALIASES = (
        ('LAND_VALUE', ('C4', '$C4', '$C$4')),
        ('CLOSING_COSTS', ('C5', '$C5')),
        ('OPEX', ('M15', '$M$15')),
        ('EGI', ('J38', '$J$38')),
    )

    def __init__(self, formula_file_path: str = "formulas.txt"):
        """Remember the formula file location and load its formulas."""
        self.formula_file_path = formula_file_path
        self.formulas = {}
        self.load_formulas()

    def load_formulas(self):
        """Parse the formula file into ``self.formulas`` keyed by cell reference.

        Loading is best-effort: an unreadable file is reported on stdout and
        leaves ``self.formulas`` with whatever was parsed so far.
        """
        try:
            with open(self.formula_file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error loading formulas: {str(e)}")
            traceback.print_exc()
            return

        for formula_num, cell_ref, description, formula in self._FORMULA_RE.findall(content):
            self.formulas[cell_ref] = {
                'number': formula_num,
                'description': description.strip(),
                # Collapse internal whitespace so the formula prints on one line.
                'formula': re.sub(r'\s+', ' ', formula.strip()),
                'cell_ref': cell_ref,
            }

        print(f"Loaded {len(self.formulas)} formulas from {self.formula_file_path}")

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Return the text of every page of a PDF, or ``""`` on failure."""
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # A page without extractable text may yield None; treat it as
                # empty instead of losing the text of all other pages.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in pdf_reader.pages
                )
        except Exception as e:
            print(f"Error extracting PDF: {str(e)}")
            return ""

    def extract_text_from_txt(self, file_path: str) -> str:
        """Return the content of a text file, or ``""`` on failure."""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                return file.read()
        except Exception as e:
            print(f"Error reading TXT: {str(e)}")
            return ""

    @staticmethod
    def _first_number(pattern_list: List[str], text: str) -> Any:
        """Return the first capture that parses as a float, else ``None``."""
        for pattern in pattern_list:
            for raw in re.findall(pattern, text, re.IGNORECASE):
                try:
                    return float(raw.replace(',', '').strip())
                except ValueError:
                    # e.g. a lone "," or "." captured by a permissive class.
                    continue
        return None

    def extract_data_from_files(self, files: List[str]) -> Dict[str, Any]:
        """Extract known property metrics from the given PDF/TXT files.

        Args:
            files: Paths of the uploaded documents; ``.pdf`` files go through
                the PDF reader, everything else is read as text.

        Returns:
            Mapping of metric names (and the spreadsheet cells aliased to
            them) to float values. Metrics not found are simply absent.
        """
        combined_text = "".join(
            (
                self.extract_text_from_pdf(path)
                if path.lower().endswith('.pdf')
                else self.extract_text_from_txt(path)
            ) + "\n"
            for path in files
        )

        extracted_data = {}
        for key, pattern_list in self._EXTRACTION_PATTERNS.items():
            value = self._first_number(pattern_list, combined_text)
            if value is not None:
                extracted_data[key] = value

        # Derived values; zero denominators are skipped rather than raising.
        units = extracted_data.get('UNITS')
        if 'PRICE' in extracted_data:
            if units:
                extracted_data['PRICE_PER_UNIT'] = extracted_data['PRICE'] / units
        elif 'PRICE_PER_UNIT' in extracted_data and units:
            extracted_data['PRICE'] = extracted_data['PRICE_PER_UNIT'] * units

        if 'NOI' in extracted_data and extracted_data.get('PRICE'):
            extracted_data['CALCULATED_CAP_RATE'] = (
                extracted_data['NOI'] / extracted_data['PRICE']
            ) * 100

        # Values above 1 are taken to be percentages and turned into fractions.
        for rate_key in ('LTC', 'INTEREST_RATE'):
            if rate_key in extracted_data and extracted_data[rate_key] > 1:
                extracted_data[rate_key] = extracted_data[rate_key] / 100

        self._apply_aliases(extracted_data, self._SIZE_CELL_ALIASES)

        # Assume RSF is 90% of GSF if not provided
        if 'BUILDING_SF' in extracted_data and 'E2' not in extracted_data:
            rentable_sf = extracted_data['BUILDING_SF'] * 0.9
            for cell in ('E2', 'E$2', '$E$2'):
                extracted_data[cell] = rentable_sf

        self._apply_aliases(extracted_data, self._FINANCIAL_CELL_ALIASES)
        return extracted_data

    @staticmethod
    def _apply_aliases(data: Dict[str, Any], aliases) -> None:
        """Copy each available metric onto the spreadsheet cells that hold it."""
        for metric, cells in aliases:
            if metric in data:
                for cell in cells:
                    data[cell] = data[metric]

    @staticmethod
    def _column_index(letters: str) -> int:
        """Convert spreadsheet column letters to a 1-based index (A=1, AA=27)."""
        index = 0
        for ch in letters:
            index = index * 26 + (ord(ch) - ord('A') + 1)
        return index

    @staticmethod
    def _column_name(index: int) -> str:
        """Convert a 1-based column index back to spreadsheet letters."""
        name = ''
        while index > 0:
            index, remainder = divmod(index - 1, 26)
            name = chr(ord('A') + remainder) + name
        return name

    @classmethod
    def _expand_ranges(cls, formula: str) -> str:
        """Rewrite ``A1:B2`` ranges as an explicit comma-separated cell list."""

        def expand(match):
            first_col, last_col = sorted(
                (cls._column_index(match.group(1)), cls._column_index(match.group(3)))
            )
            first_row, last_row = sorted((int(match.group(2)), int(match.group(4))))
            cell_count = (last_col - first_col + 1) * (last_row - first_row + 1)
            if cell_count > cls._MAX_RANGE_CELLS:
                return match.group(0)
            return ", ".join(
                f"{cls._column_name(col)}{row}"
                for col in range(first_col, last_col + 1)
                for row in range(first_row, last_row + 1)
            )

        return cls._RANGE_RE.sub(expand, formula)

    @staticmethod
    def _candidate_keys(name: str) -> Tuple[str, ...]:
        """Return the data keys under which ``name`` may be stored."""
        bare = name.replace('$', '')
        return (name, bare, name.upper(), bare.upper())

    @staticmethod
    def _formula_functions() -> Dict[str, Any]:
        """Return the spreadsheet functions exposed to evaluated formulas."""

        def present_value(rate, nper, pmt, fv=0, when=0):
            # Follows the spreadsheet PV(rate, nper, pmt, [fv], [type]) sign
            # convention: payments out are negative, the result is positive.
            if rate == 0:
                return -(pmt * nper + fv)
            growth = (1 + rate) ** nper
            return -(fv + pmt * (1 + rate * when) * (growth - 1) / rate) / growth

        def average(*values):
            return sum(values) / len(values)

        def choose(condition, when_true, when_false=False):
            return when_true if condition else when_false

        return {
            'SUM': lambda *values: sum(values),
            'MIN': lambda *values: min(values),
            'MAX': lambda *values: max(values),
            'AVERAGE': average,
            'IF': choose,
            'AND': lambda *values: all(values),
            'OR': lambda *values: any(values),
            'PV': present_value,
        }

    def extract_variables_from_formula(self, formula: str) -> List[str]:
        """Return the distinct variables a formula references.

        Cell references keep their ``$`` markers, ranges contribute every cell
        they span, and supported function names are excluded. Order follows
        first appearance in the formula.
        """
        variables = {}
        for match in self._TOKEN_RE.finditer(self._expand_ranges(formula)):
            token = match.group(1)
            if token.upper() not in self._FUNCTION_NAMES:
                variables.setdefault(token, None)
        return list(variables)

    def check_formula_computable(self, formula: str, data: Dict[str, Any]) -> Tuple[bool, List[str]]:
        """Report whether every variable of ``formula`` is present in ``data``.

        Returns:
            ``(is_computable, missing)`` where ``missing`` lists the variables
            that could not be found under any accepted spelling.
        """
        missing = [
            var
            for var in self.extract_variables_from_formula(formula)
            if not any(key in data for key in self._candidate_keys(var))
        ]
        return len(missing) == 0, missing

    def evaluate_formula(self, formula: str, data: Dict[str, Any]) -> Any:
        """Evaluate a spreadsheet-style formula against ``data``.

        Ranges are expanded, each variable is replaced by its value, ``^`` is
        treated as exponentiation and the supported functions (SUM, MIN, MAX,
        AVERAGE, IF, AND, OR, PV) are provided.

        Raises:
            Exception: with the message ``Error evaluating formula: ...`` when
                a value is missing or the expression cannot be evaluated.
        """
        try:
            unresolved = []

            def substitute(match):
                token = match.group(1)
                if token.upper() in self._FUNCTION_NAMES:
                    return token.upper()
                for key in self._candidate_keys(token):
                    if key in data:
                        # Parenthesised so negative values keep their sign
                        # next to operators such as ** or unary minus.
                        return f"({data[key]!r})"
                unresolved.append(token)
                return token

            expression = self._TOKEN_RE.sub(substitute, self._expand_ranges(formula))
            if unresolved:
                raise ValueError(f"missing values for: {', '.join(unresolved)}")

            # Replace ^ with ** for exponentiation
            expression = expression.replace('^', '**')

            # SECURITY NOTE: eval() with emptied builtins is not a sandbox.
            # It is acceptable only because formulas come from the local
            # formula file; never feed it text taken from uploaded documents.
            return eval(expression, {"__builtins__": {}}, self._formula_functions())

        except Exception as e:
            raise Exception(f"Error evaluating formula: {str(e)}") from e

    @staticmethod
    def _normalize_file_paths(files) -> List[str]:
        """Accept a path, a file-like object with ``.name``, or a list of either."""
        if isinstance(files, str) or not isinstance(files, (list, tuple)):
            files = [files]
        return [str(item) if isinstance(item, str) else item.name for item in files]

    def _classify_formulas(self, extracted_data: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """Split the loaded formulas into evaluated and non-computable ones."""
        computable_formulas = {}
        non_computable_formulas = {}

        for cell_ref, formula_info in self.formulas.items():
            formula = formula_info['formula']
            is_computable, missing_vars = self.check_formula_computable(formula, extracted_data)

            if not is_computable:
                non_computable_formulas[cell_ref] = {
                    'description': formula_info['description'],
                    'formula': formula,
                    'missing_variables': missing_vars,
                }
                continue

            try:
                result = self.evaluate_formula(formula, extracted_data)
            except Exception as e:
                non_computable_formulas[cell_ref] = {
                    'description': formula_info['description'],
                    'formula': formula,
                    'error': str(e),
                    'missing_variables': [],
                }
                continue

            is_number = isinstance(result, (int, float)) and not isinstance(result, bool)
            computable_formulas[cell_ref] = {
                'description': formula_info['description'],
                'formula': formula,
                'result': result,
                'formatted_result': f"{result:,.2f}" if is_number else str(result),
            }

        return computable_formulas, non_computable_formulas

    @staticmethod
    def _render_data_table(extracted_data: Dict[str, Any]) -> str:
        """Render the extracted values as a markdown table sorted by name."""
        lines = [
            "## 📥 Extracted Property Data\n\n",
            "| Variable | Value |\n|----------|-------|\n",
        ]
        for key, value in sorted(extracted_data.items()):
            shown = f"{value:,.2f}" if isinstance(value, float) else f"{value}"
            lines.append(f"| {key} | {shown} |\n")
        return "".join(lines)

    @staticmethod
    def _render_results(computable_formulas: Dict[str, Any], non_computable_formulas: Dict[str, Any]) -> str:
        """Render evaluated and non-computable formulas as markdown."""
        lines = ["## ✅ Computed Formulas\n\n"]
        for cell_ref, info in sorted(computable_formulas.items()):
            lines.append(f"### {cell_ref}: {info['description']}\n")
            lines.append(f"**Formula:** `{info['formula']}`\n")
            lines.append(f"**Result:** {info['formatted_result']}\n\n")

        if non_computable_formulas:
            lines.append("\n## ❌ Non-Computable Formulas\n\n")
            for cell_ref, info in sorted(non_computable_formulas.items()):
                lines.append(f"### {cell_ref}: {info['description']}\n")
                lines.append(f"**Formula:** `{info['formula']}`\n")
                if info.get('missing_variables'):
                    lines.append(f"**Missing Variables:** {', '.join(info['missing_variables'])}\n")
                if info.get('error'):
                    lines.append(f"**Error:** {info['error']}\n")
                lines.append("\n")
        return "".join(lines)

    def process_files(self, files) -> Tuple[str, str, str]:
        """Analyze uploaded documents; entry point for the Gradio interface.

        Args:
            files: Uploaded files as path strings or objects exposing ``.name``
                (a single item or a list).

        Returns:
            ``(summary_markdown, details_markdown, json_text)``. On failure the
            first element carries the error message and the others are empty.
        """
        try:
            if not files:
                return "❌ No files uploaded", "", ""

            file_paths = self._normalize_file_paths(files)
            extracted_data = self.extract_data_from_files(file_paths)

            if not extracted_data:
                return "❌ No data could be extracted from the files", "", ""

            computable_formulas, non_computable_formulas = self._classify_formulas(extracted_data)

            summary = "\n".join([
                "",
                "## 📊 Analysis Summary",
                "",
                f"**Total Formulas Loaded:** {len(self.formulas)}",
                f"**✅ Computable Formulas:** {len(computable_formulas)}",
                f"**❌ Non-Computable Formulas:** {len(non_computable_formulas)}",
                f"**📄 Files Processed:** {len(file_paths)}",
                f"**🔢 Data Points Extracted:** {len(extracted_data)}",
                "",
            ])

            data_display = self._render_data_table(extracted_data)
            results_display = self._render_results(computable_formulas, non_computable_formulas)

            json_output = {
                'summary': {
                    'total_formulas': len(self.formulas),
                    'computable': len(computable_formulas),
                    'non_computable': len(non_computable_formulas),
                    'files_processed': len(file_paths),
                },
                'extracted_data': extracted_data,
                'computable_formulas': computable_formulas,
                'non_computable_formulas': non_computable_formulas,
            }

            # default=str keeps the report alive when a formula yields a value
            # JSON cannot represent (e.g. a complex number).
            json_str = json.dumps(json_output, indent=2, default=str)

            return summary, data_display + "\n\n" + results_display, json_str

        except Exception as e:
            error_msg = f"❌ Error processing files:\n{str(e)}\n\n{traceback.format_exc()}"
            return error_msg, "", ""
380
+
381
# Build the analyzer once at import time; it loads "formulas.txt" from the
# working directory and is shared by every request handled by the UI.
analyzer = PropertyFormulaAnalyzer("formulas.txt")

# Gradio interface: one upload column, an action button, and three output
# areas (summary, detailed results, raw JSON) fed by analyzer.process_files.
with gr.Blocks(title="Property Formula Analyzer", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🏢 Property Formula Analyzer

    Upload property documents (PDF or TXT) to automatically extract data and compute real estate formulas.
    The system will analyze your documents and calculate all computable formulas based on the extracted data.
    """)

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="📁 Upload Property Documents",
                file_count="multiple",
                file_types=[".pdf", ".txt"],
                type="filepath"
            )

            analyze_btn = gr.Button("🔍 Analyze & Compute Formulas", variant="primary", size="lg")

            gr.Markdown("""
            ### 📋 Instructions:
            1. Upload one or more property documents (PDF or TXT format)
            2. Click "Analyze & Compute Formulas"
            3. Review the extracted data and computed formulas
            4. Download the JSON results for further analysis
            """)

    with gr.Row():
        with gr.Column():
            summary_output = gr.Markdown(label="Summary")

    with gr.Row():
        with gr.Column():
            results_output = gr.Markdown(label="Results")

    with gr.Row():
        with gr.Column():
            json_output = gr.Code(
                label="📥 Download Results (JSON)",
                language="json",
                lines=20
            )

    # Connect the button to the processing function; the three return values
    # of process_files map positionally onto the three output components.
    analyze_btn.click(
        fn=analyzer.process_files,
        inputs=[file_input],
        outputs=[summary_output, results_output, json_output]
    )

    gr.Markdown("""
    ---
    ### 📝 Notes:
    - The system automatically extracts property metrics like units, price, NOI, operating expenses, etc.
    - Formulas are computed only when all required variables are available in the extracted data
    - Non-computable formulas are listed with their missing variables
    - All results can be downloaded as JSON for further processing
    """)

if __name__ == "__main__":
    app.launch()