mlbench123 committed on
Commit
e55d55f
·
verified ·
1 Parent(s): bdd40b1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +248 -188
app.py CHANGED
@@ -2,44 +2,68 @@ import gradio as gr
2
  import PyPDF2
3
  import re
4
  import json
5
- import io
6
  from typing import Dict, List, Tuple, Any
7
  import traceback
8
 
9
- class PropertyFormulaAnalyzer:
10
  def __init__(self, formula_file_path: str = "formulas.txt"):
11
- """Initialize the analyzer with the formula file path"""
12
  self.formula_file_path = formula_file_path
13
  self.formulas = {}
14
- self.computed_values = {} # Store computed values for cascading calculations
15
  self.load_formulas()
16
 
17
  def load_formulas(self):
18
- """Load and parse all formulas from the formula file"""
19
  try:
20
  with open(self.formula_file_path, 'r', encoding='utf-8') as f:
21
  content = f.read()
22
 
23
- # Parse formulas using regex
24
- pattern = r'(\d+)\.\s+([A-Z]+\d+)\s*\(([^)]+)\)\s*=\s*([^=\n]+?)(?=\s+\d+\.|$)'
25
- matches = re.findall(pattern, content, re.DOTALL)
26
 
27
- for match in matches:
28
- formula_num, cell_ref, description, formula = match
29
- formula = formula.strip()
30
- formula = re.sub(r'\s+', ' ', formula)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- self.formulas[cell_ref] = {
33
- 'number': formula_num,
34
- 'description': description.strip(),
35
- 'formula': formula,
36
- 'cell_ref': cell_ref
 
 
 
 
37
  }
38
 
39
- print(f"Loaded {len(self.formulas)} formulas from {self.formula_file_path}")
40
 
41
  except Exception as e:
42
- print(f"Error loading formulas: {str(e)}")
43
  traceback.print_exc()
44
 
45
  def extract_text_from_pdf(self, file_path: str) -> str:
@@ -65,7 +89,7 @@ class PropertyFormulaAnalyzer:
65
  return ""
66
 
67
  def extract_data_from_files(self, files: List[str]) -> Dict[str, Any]:
68
- """Extract all relevant data from uploaded property files"""
69
  combined_text = ""
70
 
71
  for file_path in files:
@@ -76,34 +100,74 @@ class PropertyFormulaAnalyzer:
76
 
77
  extracted_data = {}
78
 
79
- # Define extraction patterns
80
  patterns = {
81
- 'UNITS': [r'(?:Total\s+)?Units?\s*:?\s*(\d+)', r'Units\s*(\d+)'],
82
- 'BUILDING_SF': [r'Building\s+(?:Size|SF)\s*:?\s*([\d,]+)', r'Building\s+(?:Size|SF)\s*(\d+)'],
83
- 'LOT_ACRES': [r'Lot\s+Size\s*:?\s*([\d.]+)\s*(?:acres?|Acres?)', r'Lot:\s*([\d.]+)\s*acres?'],
84
- 'PRICE': [r'(?:Asking\s+)?Price\s*:?\s*\$\s*([\d,]+)', r'Price\s+per\s+Unit\s*\$\s*([\d,]+)'],
85
- 'NOI': [r'Net\s+Operating\s+Income\s*(?:\(NOI\))?\s*:?\s*\$?\s*([\d,]+)', r'NOI\s*:?\s*\$?\s*([\d,]+)'],
86
- 'EGI': [r'Effective\s+Gross\s+Income\s*:?\s*\$?\s*([\d,]+)', r'EGI\s*:?\s*\$?\s*([\d,]+)'],
87
- 'GPR': [r'Gross\s+Potential\s+Rent\s*(?:\(Annual\))?\s*:?\s*\$?\s*([\d,]+)', r'GPR\s*:?\s*\$?\s*([\d,]+)'],
88
- 'OPEX': [r'Operating\s+Expenses\s*:?\s*\$?\s*([\d,]+)', r'Total\s+Operating\s+Expenses\s*=?\s*\$?\s*([\d,]+)'],
 
 
 
 
 
 
 
 
89
  'VACANCY': [r'Vacancy\s*(?:\([\d.]+%\))?\s*:?\s*-?\$?\s*([\d,]+)'],
 
 
 
 
 
 
90
  'PROPERTY_TAXES': [r'Property\s+Taxes\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
 
91
  'INSURANCE': [r'Insurance\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
92
  'UTILITIES': [r'Utilities\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
93
- 'REPAIRS_MAINTENANCE': [r'Repairs?\s*(?:&|and)?\s*Maintenance\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
94
  'PAYROLL': [r'Payroll\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
95
  'ADMINISTRATIVE': [r'Administrative\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
96
  'MARKETING': [r'Marketing\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
 
97
  'REPLACEMENT_RESERVES': [r'Replacement\s+Reserves\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
98
  'MANAGEMENT_FEE': [r'Management\s*(?:\([^)]+\))?\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
99
- 'CAP_RATE': [r'Cap\s+Rate\s*:?\s*([\d.]+)%?', r'Cap\s+Rate\s+([\d.]+)'],
 
 
 
 
100
  'INTEREST_RATE': [r'Interest\s+Rate\s*:?\s*([\d.]+)%?'],
 
101
  'LTC': [r'Loan[- ]to[- ]Cost\s*(?:\(LTC\))?\s*:?\s*([\d.]+)%?'],
 
102
  'EXIT_CAP_RATE': [r'Exit\s+Cap\s+Rate\s*:?\s*([\d.]+)%?'],
 
 
 
103
  'MEDIAN_INCOME': [r'Median\s+(?:HH\s+)?Income\s*:?\s*\$?\s*([\d,]+)'],
104
  'POPULATION': [r'Population\s*:?\s*([\d,]+)'],
105
  'HOUSEHOLDS': [r'Households\s*:?\s*([\d,]+)'],
106
  'RENTER_OCCUPIED_PCT': [r'Renter[- ]Occupied\s*:?\s*([\d.]+)%?'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  }
108
 
109
  for key, pattern_list in patterns.items():
@@ -118,138 +182,127 @@ class PropertyFormulaAnalyzer:
118
  except (ValueError, IndexError):
119
  continue
120
 
121
- # Derived values
122
- if 'PRICE' in extracted_data and 'UNITS' in extracted_data:
123
- extracted_data['PRICE_PER_UNIT'] = extracted_data['PRICE'] / extracted_data['UNITS']
 
 
124
 
125
- if 'NOI' in extracted_data and 'PRICE' in extracted_data:
126
- extracted_data['CALCULATED_CAP_RATE'] = (extracted_data['NOI'] / extracted_data['PRICE']) * 100
 
 
127
 
128
- if 'LTC' in extracted_data and extracted_data['LTC'] > 1:
129
- extracted_data['LTC'] = extracted_data['LTC'] / 100
130
 
131
- if 'INTEREST_RATE' in extracted_data and extracted_data['INTEREST_RATE'] > 1:
132
- extracted_data['INTEREST_RATE'] = extracted_data['INTEREST_RATE'] / 100
 
 
 
133
 
134
- # Map to cell references
135
- if 'BUILDING_SF' in extracted_data:
136
- extracted_data['D2'] = extracted_data['BUILDING_SF']
137
 
138
- if 'UNITS' in extracted_data:
139
- extracted_data['F2'] = extracted_data['UNITS']
 
140
 
141
- if 'BUILDING_SF' in extracted_data:
142
- extracted_data['E2'] = extracted_data['BUILDING_SF'] * 0.9
143
 
144
- if 'OPEX' in extracted_data:
145
- extracted_data['M15'] = extracted_data['OPEX']
 
146
 
147
- if 'EGI' in extracted_data:
148
- extracted_data['J38'] = extracted_data['EGI']
 
149
 
150
- return extracted_data
151
-
152
def normalize_cell_ref(self, cell_ref: str) -> str:
    """Return ``cell_ref`` with every ``$`` removed (``$D$2`` -> ``D2``)."""
    return ''.join(cell_ref.split('$'))
155
-
156
- def get_value(self, var: str, data: Dict[str, Any]) -> Any:
157
- """Get value for a variable, handling all variants"""
158
- # Try exact match
159
- if var in data:
160
- return data[var]
161
 
162
- # Try normalized (without $)
163
- normalized = self.normalize_cell_ref(var)
164
- if normalized in data:
165
- return data[normalized]
166
 
167
- # Try with computed values
168
- if var in self.computed_values:
169
- return self.computed_values[var]
170
 
171
- if normalized in self.computed_values:
172
- return self.computed_values[normalized]
 
173
 
174
- return None
 
 
 
 
 
 
 
175
 
176
  def extract_variables_from_formula(self, formula: str) -> List[str]:
177
- """Extract all variable references from a formula"""
178
- # Match Excel-style cell references (e.g., C4, $D$2, E2)
179
- cell_pattern = r'\$?[A-Z]+\$?\d+'
180
- variables = re.findall(cell_pattern, formula)
 
181
 
182
- # Remove Excel functions and operators
183
- excel_functions = {'SUM', 'PV', 'MIN', 'MAX', 'AVERAGE', 'IF', 'AND', 'OR'}
184
- variables = [v for v in variables if v not in excel_functions]
185
 
186
  return list(set(variables))
187
 
188
  def check_formula_computable(self, formula: str, data: Dict[str, Any]) -> Tuple[bool, List[str]]:
189
- """Check if a formula can be computed with available data"""
190
  variables = self.extract_variables_from_formula(formula)
191
  missing = []
192
 
193
  for var in variables:
194
- if self.get_value(var, data) is None:
195
  missing.append(var)
196
 
197
  return len(missing) == 0, missing
198
 
199
  def safe_eval_formula(self, formula: str, data: Dict[str, Any]) -> Any:
      """Translate an Excel-style formula to a Python expression and evaluate it.

      The formula text is rewritten in a fixed order: SUM(...) calls, PV(...)
      calls, MIN(...) calls, cell references, ``^`` and finally ``--``. The
      resulting string is passed to ``eval``.

      Args:
          formula: Formula text using Excel-style cell references.
          data: Known values; cells are resolved through ``self.get_value``.

      Returns:
          Whatever ``eval`` produces for the rewritten expression.

      Raises:
          Exception: Any failure is re-raised as a plain ``Exception`` whose
              message contains the original formula and the cause.
      """
      try:
          formula_py = formula

          # Callback for re.sub: receives one SUM(...) match and returns the
          # text that replaces it.
          def process_sum_range(match):
              range_str = match.group(1)
              if ':' in range_str:
                  # Ranges such as A1:A5 are not expanded; they contribute 0.
                  return '0'
              else:
                  # Comma-separated cells: keep only those that have a value.
                  cells = [c.strip() for c in range_str.split(',')]
                  values = []
                  for cell in cells:
                      val = self.get_value(cell, data)
                      if val is not None:
                          values.append(str(val))
                  if values:
                      # Parenthesised so the sum keeps its grouping in context.
                      return f"({'+'.join(values)})"
                  return '0'

          sum_pattern = r'SUM\(([^)]+)\)'
          formula_py = re.sub(sum_pattern, process_sum_range, formula_py)

          # PV(...) is not computed; every call is replaced by the literal 0.
          formula_py = re.sub(r'PV\([^)]+\)', '0', formula_py)

          # MIN(a, b) becomes min([a, b]) so the builtin receives one list.
          formula_py = re.sub(r'MIN\(([^)]+)\)', r'min([\1])', formula_py)

          # Substitute cell references, longest names first so a longer
          # reference is replaced before any shorter one it contains.
          variables = self.extract_variables_from_formula(formula_py)
          for var in sorted(variables, key=len, reverse=True):
              value = self.get_value(var, data)
              if value is not None:
                  # Plain substring replacement, not token-aware.
                  formula_py = formula_py.replace(var, str(value))

          # Excel exponent operator -> Python exponent operator.
          formula_py = formula_py.replace('^', '**')

          # Every '--' in the text is turned into '+'.
          formula_py = formula_py.replace('--', '+')

          # Evaluate with only min/max/sum exposed as builtins and no locals.
          result = eval(formula_py, {"__builtins__": {"min": min, "max": max, "sum": sum}}, {})
          return result

      except Exception as e:
          raise Exception(f"Error evaluating formula '{formula}': {str(e)}")
250
 
251
  def process_files(self, files) -> Tuple[str, str, str]:
252
- """Main processing function for Gradio interface"""
253
  try:
254
  if not files:
255
  return "❌ No files uploaded", "", ""
@@ -265,22 +318,20 @@ class PropertyFormulaAnalyzer:
265
  # Reset computed values
266
  self.computed_values = {}
267
 
268
- # Multiple passes to handle dependencies
269
- max_iterations = 5
270
  computable_formulas = {}
271
  non_computable_formulas = {}
272
 
273
  for iteration in range(max_iterations):
274
  newly_computed = 0
275
 
276
- for cell_ref, formula_info in self.formulas.items():
277
  # Skip if already computed
278
- if cell_ref in computable_formulas:
279
  continue
280
 
281
  formula = formula_info['formula']
282
-
283
- # Combine extracted data with computed values for checking
284
  all_data = {**extracted_data, **self.computed_values}
285
 
286
  is_computable, missing_vars = self.check_formula_computable(formula, all_data)
@@ -289,8 +340,7 @@ class PropertyFormulaAnalyzer:
289
  try:
290
  result = self.safe_eval_formula(formula, all_data)
291
 
292
- # Store result
293
- computable_formulas[cell_ref] = {
294
  'description': formula_info['description'],
295
  'formula': formula,
296
  'result': result,
@@ -298,35 +348,32 @@ class PropertyFormulaAnalyzer:
298
  'iteration': iteration + 1
299
  }
300
 
301
- # Add to computed values for cascading
302
- self.computed_values[cell_ref] = result
303
- self.computed_values[self.normalize_cell_ref(cell_ref)] = result
304
-
305
  newly_computed += 1
306
 
307
  except Exception as e:
308
- non_computable_formulas[cell_ref] = {
309
  'description': formula_info['description'],
310
  'formula': formula,
311
  'error': str(e),
312
  'missing_variables': []
313
  }
314
  else:
315
- non_computable_formulas[cell_ref] = {
316
  'description': formula_info['description'],
317
  'formula': formula,
318
  'missing_variables': missing_vars
319
  }
320
 
321
- print(f"Iteration {iteration + 1}: Computed {newly_computed} new formulas")
322
 
323
- # If no new formulas computed, stop
324
  if newly_computed == 0:
325
  break
326
 
327
- # Remove successfully computed formulas from non-computable list
328
- for cell_ref in computable_formulas.keys():
329
- non_computable_formulas.pop(cell_ref, None)
330
 
331
  # Create summary
332
  summary = f"""
@@ -338,52 +385,64 @@ class PropertyFormulaAnalyzer:
338
  **πŸ“„ Files Processed:** {len(file_paths)}
339
  **πŸ”’ Data Points Extracted:** {len(extracted_data)}
340
  **πŸ”„ Computation Iterations:** {iteration + 1}
 
341
  """
342
 
343
- # Create extracted data display
344
  data_display = "## πŸ“₯ Extracted Property Data\n\n"
345
  data_display += "| Variable | Value |\n|----------|-------|\n"
346
  for key, value in sorted(extracted_data.items()):
347
  if isinstance(value, float):
348
- data_display += f"| {key} | {value:,.2f} |\n"
349
  else:
350
  data_display += f"| {key} | {value} |\n"
351
 
352
- # Create results display
353
  results_display = "## βœ… Computed Formulas\n\n"
354
- for cell_ref, info in sorted(computable_formulas.items()):
355
- results_display += f"### {cell_ref}: {info['description']}\n"
356
- results_display += f"**Formula:** `{info['formula']}`\n"
357
- results_display += f"**Result:** {info['formatted_result']}\n"
358
- results_display += f"*Computed in iteration {info['iteration']}*\n\n"
 
 
 
 
 
 
 
 
 
 
359
 
360
  # if non_computable_formulas:
361
- # results_display += "\n## ❌ Non-Computable Formulas\n\n"
362
- # # Show only first 20 to avoid overwhelming output
363
- # for idx, (cell_ref, info) in enumerate(sorted(non_computable_formulas.items())):
364
- # if idx >= 20:
365
- # results_display += f"\n*... and {len(non_computable_formulas) - 20} more non-computable formulas*\n"
 
 
366
  # break
367
- # results_display += f"### {cell_ref}: {info['description']}\n"
368
- # results_display += f"**Formula:** `{info['formula']}`\n"
369
  # if info.get('missing_variables'):
370
- # results_display += f"**Missing Variables:** {', '.join(info['missing_variables'][:5])}\n"
371
- # if info.get('error'):
372
- # results_display += f"**Error:** {info['error']}\n"
373
  # results_display += "\n"
374
 
375
- # Create JSON output
376
  json_output = {
377
  'summary': {
378
  'total_formulas': len(self.formulas),
379
  'computable': len(computable_formulas),
380
  'non_computable': len(non_computable_formulas),
381
  'files_processed': len(file_paths),
382
- 'iterations': iteration + 1
 
383
  },
384
  'extracted_data': extracted_data,
385
  'computable_formulas': computable_formulas,
386
- 'non_computable_formulas': non_computable_formulas
387
  }
388
 
389
  json_str = json.dumps(json_output, indent=2)
@@ -394,16 +453,21 @@ class PropertyFormulaAnalyzer:
394
  error_msg = f"❌ Error processing files:\n{str(e)}\n\n{traceback.format_exc()}"
395
  return error_msg, "", ""
396
 
397
- # Initialize the analyzer
398
- analyzer = PropertyFormulaAnalyzer("formulas.txt")
399
 
400
  # Create Gradio interface
401
  with gr.Blocks(title="Property Formula Analyzer", theme=gr.themes.Soft()) as app:
402
  gr.Markdown("""
403
- # 🏒 Property Formula Analyzer
 
 
404
 
405
- Upload property documents (PDF or TXT) to automatically extract data and compute real estate formulas.
406
- The system uses iterative computation to handle formula dependencies.
 
 
 
407
  """)
408
 
409
  with gr.Row():
@@ -419,29 +483,26 @@ with gr.Blocks(title="Property Formula Analyzer", theme=gr.themes.Soft()) as app
419
 
420
  gr.Markdown("""
421
  ### πŸ“‹ Instructions:
422
- 1. Upload one or more property documents (PDF or TXT format)
423
  2. Click "Analyze & Compute Formulas"
424
- 3. Review the extracted data and computed formulas
425
- 4. Download the JSON results for further analysis
426
 
427
- **Note:** The system performs multiple computation passes to handle formula dependencies.
428
  """)
429
 
430
  with gr.Row():
431
- with gr.Column():
432
- summary_output = gr.Markdown(label="Summary")
433
 
434
  with gr.Row():
435
- with gr.Column():
436
- results_output = gr.Markdown(label="Results")
437
 
438
  with gr.Row():
439
- with gr.Column():
440
- json_output = gr.Code(
441
- label="πŸ“₯ Download Results (JSON)",
442
- language="json",
443
- lines=20
444
- )
445
 
446
  analyze_btn.click(
447
  fn=analyzer.process_files,
@@ -451,11 +512,10 @@ with gr.Blocks(title="Property Formula Analyzer", theme=gr.themes.Soft()) as app
451
 
452
  gr.Markdown("""
453
  ---
454
- ### πŸ“ Notes:
455
- - The system automatically extracts property metrics from your documents
456
- - Formulas are computed iteratively to handle dependencies between formulas
457
- - Non-computable formulas are listed with their missing variables
458
- - All results can be downloaded as JSON for further processing
459
  """)
460
 
461
  if __name__ == "__main__":
 
2
  import PyPDF2
3
  import re
4
  import json
 
5
  from typing import Dict, List, Tuple, Any
6
  import traceback
7
 
8
+ class SemanticFormulaAnalyzer:
9
  def __init__(self, formula_file_path: str = "formulas.txt"):
10
+ """Initialize the analyzer with the semantic formula file"""
11
  self.formula_file_path = formula_file_path
12
  self.formulas = {}
13
+ self.computed_values = {}
14
  self.load_formulas()
15
 
16
  def load_formulas(self):
17
+ """Load semantic formulas from file"""
18
  try:
19
  with open(self.formula_file_path, 'r', encoding='utf-8') as f:
20
  content = f.read()
21
 
22
+ # Parse semantic formulas: Variable_Name = formula
23
+ # Pattern: capture variable name, formula, and description
24
+ lines = content.split('\n')
25
 
26
+ current_formula_name = None
27
+ current_formula = None
28
+ current_description = None
29
+
30
+ for line in lines:
31
+ line = line.strip()
32
+
33
+ # Skip empty lines and section headers
34
+ if not line or line.startswith('#'):
35
+ continue
36
+
37
+ # Check if line contains a formula assignment
38
+ if '=' in line and not line.startswith('#'):
39
+ # Save previous formula if exists
40
+ if current_formula_name and current_formula:
41
+ self.formulas[current_formula_name] = {
42
+ 'formula': current_formula,
43
+ 'description': current_description or current_formula_name
44
+ }
45
+
46
+ # Parse new formula
47
+ parts = line.split('=', 1)
48
+ current_formula_name = parts[0].strip()
49
+ current_formula = parts[1].strip()
50
+ current_description = None
51
 
52
+ # Check if line is a description comment
53
+ elif line.startswith('# Description:'):
54
+ current_description = line.replace('# Description:', '').strip()
55
+
56
+ # Add last formula
57
+ if current_formula_name and current_formula:
58
+ self.formulas[current_formula_name] = {
59
+ 'formula': current_formula,
60
+ 'description': current_description or current_formula_name
61
  }
62
 
63
+ print(f"βœ… Loaded {len(self.formulas)} semantic formulas")
64
 
65
  except Exception as e:
66
+ print(f"❌ Error loading formulas: {str(e)}")
67
  traceback.print_exc()
68
 
69
  def extract_text_from_pdf(self, file_path: str) -> str:
 
89
  return ""
90
 
91
  def extract_data_from_files(self, files: List[str]) -> Dict[str, Any]:
92
+ """Extract data with semantic variable names"""
93
  combined_text = ""
94
 
95
  for file_path in files:
 
100
 
101
  extracted_data = {}
102
 
103
+ # Comprehensive extraction patterns with semantic names
104
  patterns = {
105
+ # Basic Property Info
106
+ 'UNITS': [r'(?:Total\s+)?Units?\s*:?\s*(\d+)', r'(\d+)\s*units?'],
107
+ 'GROSS_SF': [r'Building\s+(?:Size|SF)\s*:?\s*([\d,]+)', r'Building\s+SF\s*(\d+)', r'(\d+)\s*SF'],
108
+ 'BUILDING_SF': [r'Building\s+(?:Size|SF)\s*:?\s*([\d,]+)'],
109
+ 'RENTABLE_SF': [r'Rentable\s+SF\s*:?\s*([\d,]+)', r'RSF\s*:?\s*([\d,]+)'],
110
+ 'LOT_ACRES': [r'Lot\s+Size\s*:?\s*([\d.]+)\s*(?:acres?|Acres?)'],
111
+ 'LOT_SF': [r'Lot\s+(?:Size\s+)?SF\s*:?\s*([\d,]+)'],
112
+
113
+ # Financial - Core
114
+ 'PRICE': [r'(?:Asking\s+)?Price\s*:?\s*\$\s*([\d,]+)', r'Purchase\s+Price\s*:?\s*\$\s*([\d,]+)'],
115
+ 'NOI': [r'Net\s+Operating\s+Income\s*(?:\(NOI\))?\s*:?\s*\$?\s*([\d,]+)'],
116
+ 'NET_OPERATING_INCOME': [r'Net\s+Operating\s+Income\s*(?:\(NOI\))?\s*:?\s*\$?\s*([\d,]+)'],
117
+ 'EGI': [r'Effective\s+Gross\s+Income\s*:?\s*\$?\s*([\d,]+)'],
118
+ 'EFFECTIVE_GROSS_INCOME': [r'Effective\s+Gross\s+Income\s*:?\s*\$?\s*([\d,]+)'],
119
+ 'GPR': [r'Gross\s+Potential\s+Rent\s*(?:\(Annual\))?\s*:?\s*\$?\s*([\d,]+)'],
120
+ 'GROSS_POTENTIAL_RENT': [r'Gross\s+Potential\s+Rent\s*:?\s*\$?\s*([\d,]+)'],
121
  'VACANCY': [r'Vacancy\s*(?:\([\d.]+%\))?\s*:?\s*-?\$?\s*([\d,]+)'],
122
+ 'VACANCY_LOSS': [r'Vacancy\s*(?:\([\d.]+%\))?\s*:?\s*-?\$?\s*([\d,]+)'],
123
+ 'VACANCY_RATE': [r'Vacancy\s*(?:\()?([\d.]+)%'],
124
+
125
+ # Operating Expenses
126
+ 'OPEX': [r'Operating\s+Expenses\s*:?\s*\$?\s*([\d,]+)', r'Total\s+Operating\s+Expenses\s*=?\s*\$?\s*([\d,]+)'],
127
+ 'TOTAL_OPERATING_EXPENSES': [r'Total\s+Operating\s+Expenses\s*=?\s*\$?\s*([\d,]+)'],
128
  'PROPERTY_TAXES': [r'Property\s+Taxes\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
129
+ 'REAL_ESTATE_TAXES': [r'Property\s+Taxes\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
130
  'INSURANCE': [r'Insurance\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
131
  'UTILITIES': [r'Utilities\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
132
+ 'REPAIRS_AND_MAINTENANCE': [r'Repairs?\s*(?:&|and)?\s*Maintenance\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
133
  'PAYROLL': [r'Payroll\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
134
  'ADMINISTRATIVE': [r'Administrative\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
135
  'MARKETING': [r'Marketing\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
136
+ 'ADVERTISING_AND_MARKETING_COST': [r'Marketing\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
137
  'REPLACEMENT_RESERVES': [r'Replacement\s+Reserves\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
138
  'MANAGEMENT_FEE': [r'Management\s*(?:\([^)]+\))?\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
139
+ 'MANAGEMENT_FEE_PERCENTAGE': [r'Management\s*.*?(\d+)%', r'Management\s*@\s*([\d.]+)%'],
140
+ 'PROFESSIONAL_FEES': [r'Professional\s+Fees\s*:?\s*\$?\s*([\d,]+\.?\d*)'],
141
+
142
+ # Rates and Percentages
143
+ 'CAP_RATE': [r'Cap\s+Rate\s*:?\s*([\d.]+)%?'],
144
  'INTEREST_RATE': [r'Interest\s+Rate\s*:?\s*([\d.]+)%?'],
145
+ 'INTEREST_RATE_DECIMAL': [r'Interest\s+Rate\s*:?\s*([\d.]+)%?'],
146
  'LTC': [r'Loan[- ]to[- ]Cost\s*(?:\(LTC\))?\s*:?\s*([\d.]+)%?'],
147
+ 'LTC_RATIO': [r'Loan[- ]to[- ]Cost\s*(?:\(LTC\))?\s*:?\s*([\d.]+)%?'],
148
  'EXIT_CAP_RATE': [r'Exit\s+Cap\s+Rate\s*:?\s*([\d.]+)%?'],
149
+ 'EXIT_CAP_RATE_DECIMAL': [r'Exit\s+Cap\s+Rate\s*:?\s*([\d.]+)%?'],
150
+
151
+ # Demographics
152
  'MEDIAN_INCOME': [r'Median\s+(?:HH\s+)?Income\s*:?\s*\$?\s*([\d,]+)'],
153
  'POPULATION': [r'Population\s*:?\s*([\d,]+)'],
154
  'HOUSEHOLDS': [r'Households\s*:?\s*([\d,]+)'],
155
  'RENTER_OCCUPIED_PCT': [r'Renter[- ]Occupied\s*:?\s*([\d.]+)%?'],
156
+
157
+ # Construction & Development
158
+ 'CONSTRUCTION_COST_PER_GSF': [r'Construction\s+Cost\s*:?\s*\$?\s*([\d,]+)\s*per\s+(?:GSF|SF)'],
159
+ 'TOTAL_CONSTRUCTION_GMP': [r'(?:Total\s+)?Construction\s+GMP\s*:?\s*\$?\s*([\d,]+)'],
160
+ 'SOFT_COSTS': [r'(?:Total\s+)?Soft\s+Costs?\s*:?\s*\$?\s*([\d,]+)'],
161
+ 'TOTAL_SOFT_COST': [r'(?:Total\s+)?Soft\s+Costs?\s*:?\s*\$?\s*([\d,]+)'],
162
+ 'CONTINGENCY': [r'Contingency\s*:?\s*\$?\s*([\d,]+)'],
163
+ 'CONTINGENCY_COST': [r'Contingency\s*:?\s*\$?\s*([\d,]+)'],
164
+ 'DEV_FEE': [r'Dev(?:elopment)?\s+Fee\s*:?\s*\$?\s*([\d,]+)'],
165
+ 'DEVELOPMENT_FEE': [r'Dev(?:elopment)?\s+Fee\s*:?\s*\$?\s*([\d,]+)'],
166
+
167
+ # Land & Acquisition
168
+ 'LAND_VALUE': [r'(?:Total\s+)?Land\s+Value\s*:?\s*\$?\s*([\d,]+)'],
169
+ 'CLOSING_COSTS': [r'Closing\s+Costs\s*:?\s*\$?\s*([\d,]+)'],
170
+ 'ACQUISITION_FEE': [r'Acq(?:uisition)?\s+Fee\s*:?\s*\$?\s*([\d,]+)'],
171
  }
172
 
173
  for key, pattern_list in patterns.items():
 
182
  except (ValueError, IndexError):
183
  continue
184
 
185
+ # Post-processing: Handle percentages and derived values
186
+ if 'INTEREST_RATE' in extracted_data:
187
+ if extracted_data['INTEREST_RATE'] > 1:
188
+ extracted_data['INTEREST_RATE'] = extracted_data['INTEREST_RATE'] / 100
189
+ extracted_data['INTEREST_RATE_DECIMAL'] = extracted_data['INTEREST_RATE']
190
 
191
+ if 'LTC' in extracted_data:
192
+ if extracted_data['LTC'] > 1:
193
+ extracted_data['LTC'] = extracted_data['LTC'] / 100
194
+ extracted_data['LTC_RATIO'] = extracted_data['LTC']
195
 
196
+ if 'CAP_RATE' in extracted_data and extracted_data['CAP_RATE'] < 1:
197
+ extracted_data['CAP_RATE'] = extracted_data['CAP_RATE'] * 100
198
 
199
+ if 'EXIT_CAP_RATE' in extracted_data:
200
+ if extracted_data['EXIT_CAP_RATE'] > 1:
201
+ extracted_data['EXIT_CAP_RATE_DECIMAL'] = extracted_data['EXIT_CAP_RATE'] / 100
202
+ else:
203
+ extracted_data['EXIT_CAP_RATE_DECIMAL'] = extracted_data['EXIT_CAP_RATE']
204
 
205
+ if 'VACANCY_RATE' in extracted_data and extracted_data['VACANCY_RATE'] > 1:
206
+ extracted_data['VACANCY_RATE'] = extracted_data['VACANCY_RATE'] / 100
 
207
 
208
+ # Map synonyms
209
+ if 'BUILDING_SF' in extracted_data and 'GROSS_SF' not in extracted_data:
210
+ extracted_data['GROSS_SF'] = extracted_data['BUILDING_SF']
211
 
212
+ if 'GROSS_SF' in extracted_data and 'BUILDING_SF' not in extracted_data:
213
+ extracted_data['BUILDING_SF'] = extracted_data['GROSS_SF']
214
 
215
+ # Estimate RENTABLE_SF if not provided (assume 90% efficiency)
216
+ if 'GROSS_SF' in extracted_data and 'RENTABLE_SF' not in extracted_data:
217
+ extracted_data['RENTABLE_SF'] = extracted_data['GROSS_SF'] * 0.9
218
 
219
+ # Map EGI synonyms
220
+ if 'EGI' in extracted_data and 'EFFECTIVE_GROSS_INCOME' not in extracted_data:
221
+ extracted_data['EFFECTIVE_GROSS_INCOME'] = extracted_data['EGI']
222
 
223
+ if 'EFFECTIVE_GROSS_INCOME' in extracted_data and 'EGI' not in extracted_data:
224
+ extracted_data['EGI'] = extracted_data['EFFECTIVE_GROSS_INCOME']
 
 
 
 
 
 
 
 
 
225
 
226
+ # Map NOI synonyms
227
+ if 'NOI' in extracted_data and 'NET_OPERATING_INCOME' not in extracted_data:
228
+ extracted_data['NET_OPERATING_INCOME'] = extracted_data['NOI']
 
229
 
230
+ if 'NET_OPERATING_INCOME' in extracted_data and 'NOI' not in extracted_data:
231
+ extracted_data['NOI'] = extracted_data['NET_OPERATING_INCOME']
 
232
 
233
+ # Map OPEX synonyms
234
+ if 'OPEX' in extracted_data and 'TOTAL_OPERATING_EXPENSES' not in extracted_data:
235
+ extracted_data['TOTAL_OPERATING_EXPENSES'] = extracted_data['OPEX']
236
 
237
+ if 'TOTAL_OPERATING_EXPENSES' in extracted_data and 'OPEX' not in extracted_data:
238
+ extracted_data['OPEX'] = extracted_data['TOTAL_OPERATING_EXPENSES']
239
+
240
+ # Derive management fee percentage if we have the dollar amount
241
+ if 'MANAGEMENT_FEE' in extracted_data and 'EFFECTIVE_GROSS_INCOME' in extracted_data and 'MANAGEMENT_FEE_PERCENTAGE' not in extracted_data:
242
+ extracted_data['MANAGEMENT_FEE_PERCENTAGE'] = extracted_data['MANAGEMENT_FEE'] / extracted_data['EFFECTIVE_GROSS_INCOME']
243
+
244
+ return extracted_data
245
 
246
  def extract_variables_from_formula(self, formula: str) -> List[str]:
247
+ """Extract variable names from formula"""
248
+ # Match Python-style variable names (letters, numbers, underscores)
249
+ # But exclude Python keywords and operators
250
+ var_pattern = r'\b([A-Z][A-Z0-9_]*)\b'
251
+ variables = re.findall(var_pattern, formula)
252
 
253
+ # Remove Python built-in functions
254
+ python_builtins = {'SUM', 'MIN', 'MAX', 'ABS', 'ROUND'}
255
+ variables = [v for v in variables if v not in python_builtins]
256
 
257
  return list(set(variables))
258
 
259
  def check_formula_computable(self, formula: str, data: Dict[str, Any]) -> Tuple[bool, List[str]]:
260
+ """Check if formula can be computed"""
261
  variables = self.extract_variables_from_formula(formula)
262
  missing = []
263
 
264
  for var in variables:
265
+ if var not in data and var not in self.computed_values:
266
  missing.append(var)
267
 
268
  return len(missing) == 0, missing
269
 
270
  def safe_eval_formula(self, formula: str, data: Dict[str, Any]) -> Any:
271
+ """Safely evaluate a semantic formula"""
272
  try:
273
+ # Combine extracted data with computed values
274
+ all_data = {**data, **self.computed_values}
275
+
276
+ # Replace variables with their values
277
+ formula_eval = formula
278
+ variables = self.extract_variables_from_formula(formula)
279
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  for var in sorted(variables, key=len, reverse=True):
281
+ if var in all_data:
282
+ value = all_data[var]
283
+ formula_eval = re.sub(r'\b' + var + r'\b', str(value), formula_eval)
284
 
285
+ # Replace ** with ** (already correct for Python)
286
+ # Handle any remaining math operations
287
+ formula_eval = formula_eval.replace('^', '**')
288
 
289
+ # Evaluate safely
290
+ safe_dict = {
291
+ 'min': min,
292
+ 'max': max,
293
+ 'sum': sum,
294
+ 'abs': abs,
295
+ 'round': round
296
+ }
297
 
298
+ result = eval(formula_eval, {"__builtins__": safe_dict}, {})
 
299
  return result
300
 
301
  except Exception as e:
302
+ raise Exception(f"Evaluation error: {str(e)}")
303
 
304
  def process_files(self, files) -> Tuple[str, str, str]:
305
+ """Main processing function"""
306
  try:
307
  if not files:
308
  return "❌ No files uploaded", "", ""
 
318
  # Reset computed values
319
  self.computed_values = {}
320
 
321
+ # Multiple passes for dependency resolution
322
+ max_iterations = 10
323
  computable_formulas = {}
324
  non_computable_formulas = {}
325
 
326
  for iteration in range(max_iterations):
327
  newly_computed = 0
328
 
329
+ for formula_name, formula_info in self.formulas.items():
330
  # Skip if already computed
331
+ if formula_name in computable_formulas:
332
  continue
333
 
334
  formula = formula_info['formula']
 
 
335
  all_data = {**extracted_data, **self.computed_values}
336
 
337
  is_computable, missing_vars = self.check_formula_computable(formula, all_data)
 
340
  try:
341
  result = self.safe_eval_formula(formula, all_data)
342
 
343
+ computable_formulas[formula_name] = {
 
344
  'description': formula_info['description'],
345
  'formula': formula,
346
  'result': result,
 
348
  'iteration': iteration + 1
349
  }
350
 
351
+ # Store for cascading
352
+ self.computed_values[formula_name] = result
 
 
353
  newly_computed += 1
354
 
355
  except Exception as e:
356
+ non_computable_formulas[formula_name] = {
357
  'description': formula_info['description'],
358
  'formula': formula,
359
  'error': str(e),
360
  'missing_variables': []
361
  }
362
  else:
363
+ non_computable_formulas[formula_name] = {
364
  'description': formula_info['description'],
365
  'formula': formula,
366
  'missing_variables': missing_vars
367
  }
368
 
369
+ print(f"πŸ“Š Iteration {iteration + 1}: Computed {newly_computed} new formulas (Total: {len(computable_formulas)})")
370
 
 
371
  if newly_computed == 0:
372
  break
373
 
374
+ # Remove computed formulas from non-computable list
375
+ for formula_name in computable_formulas.keys():
376
+ non_computable_formulas.pop(formula_name, None)
377
 
378
  # Create summary
379
  summary = f"""
 
385
  **πŸ“„ Files Processed:** {len(file_paths)}
386
  **πŸ”’ Data Points Extracted:** {len(extracted_data)}
387
  **πŸ”„ Computation Iterations:** {iteration + 1}
388
+ **πŸ“ˆ Success Rate:** {(len(computable_formulas) / len(self.formulas) * 100):.1f}%
389
  """
390
 
391
+ # Extracted data display
392
  data_display = "## πŸ“₯ Extracted Property Data\n\n"
393
  data_display += "| Variable | Value |\n|----------|-------|\n"
394
  for key, value in sorted(extracted_data.items()):
395
  if isinstance(value, float):
396
+ data_display += f"| {key} | {value:,.4f} |\n"
397
  else:
398
  data_display += f"| {key} | {value} |\n"
399
 
400
+ # Results display
401
  results_display = "## βœ… Computed Formulas\n\n"
402
+
403
+ # Group by iteration
404
+ by_iteration = {}
405
+ for name, info in computable_formulas.items():
406
+ iter_num = info['iteration']
407
+ if iter_num not in by_iteration:
408
+ by_iteration[iter_num] = []
409
+ by_iteration[iter_num].append((name, info))
410
+
411
+ for iter_num in sorted(by_iteration.keys()):
412
+ results_display += f"### Iteration {iter_num} ({len(by_iteration[iter_num])} formulas)\n\n"
413
+ for name, info in sorted(by_iteration[iter_num]):
414
+ results_display += f"**{name}** = {info['formatted_result']}\n"
415
+ results_display += f"*{info['description']}*\n"
416
+ results_display += f"`{info['formula']}`\n\n"
417
 
418
  # if non_computable_formulas:
419
+ # results_display += f"\n## ❌ Non-Computable Formulas ({len(non_computable_formulas)})\n\n"
420
+ # # Show sample of non-computable
421
+ # sample_size = min(15, len(non_computable_formulas))
422
+ # results_display += f"*Showing {sample_size} of {len(non_computable_formulas)} non-computable formulas*\n\n"
423
+
424
+ # for idx, (name, info) in enumerate(sorted(non_computable_formulas.items())):
425
+ # if idx >= sample_size:
426
  # break
427
+ # results_display += f"**{name}**: {info['description']}\n"
 
428
  # if info.get('missing_variables'):
429
+ # missing = info['missing_variables'][:5]
430
+ # results_display += f"Missing: {', '.join(missing)}\n"
 
431
  # results_display += "\n"
432
 
433
+ # JSON output
434
  json_output = {
435
  'summary': {
436
  'total_formulas': len(self.formulas),
437
  'computable': len(computable_formulas),
438
  'non_computable': len(non_computable_formulas),
439
  'files_processed': len(file_paths),
440
+ 'iterations': iteration + 1,
441
+ 'success_rate': round(len(computable_formulas) / len(self.formulas) * 100, 2)
442
  },
443
  'extracted_data': extracted_data,
444
  'computable_formulas': computable_formulas,
445
+ 'non_computable_formulas': {k: v for k, v in list(non_computable_formulas.items())[:20]}
446
  }
447
 
448
  json_str = json.dumps(json_output, indent=2)
 
453
  error_msg = f"❌ Error processing files:\n{str(e)}\n\n{traceback.format_exc()}"
454
  return error_msg, "", ""
455
 
456
+ # Initialize analyzer
457
+ analyzer = SemanticFormulaAnalyzer("formulas.txt")
458
 
459
  # Create Gradio interface
460
  with gr.Blocks(title="Property Formula Analyzer", theme=gr.themes.Soft()) as app:
461
  gr.Markdown("""
462
+ # 🏒 Property Formula Analyzer - Semantic Edition
463
+
464
+ Upload property documents to extract data and compute real estate formulas using **semantic variable names**.
465
 
466
+ ### Features:
467
+ - πŸ“„ Extracts data from PDFs and text files
468
+ - πŸ”’ Matches property metrics to formula variables
469
+ - πŸ”„ Multi-pass computation for dependent formulas
470
+ - πŸ“Š Clear, human-readable formula names
471
  """)
472
 
473
  with gr.Row():
 
483
 
484
  gr.Markdown("""
485
  ### πŸ“‹ Instructions:
486
+ 1. Upload property documents (Offering Memorandum, Operating Expenses, etc.)
487
  2. Click "Analyze & Compute Formulas"
488
+ 3. Review extracted data and computed metrics
489
+ 4. Download JSON results
490
 
491
+ **Example Variables**: `UNITS`, `PRICE`, `NOI`, `GROSS_SF`, `EFFECTIVE_GROSS_INCOME`
492
  """)
493
 
494
  with gr.Row():
495
+ summary_output = gr.Markdown(label="Summary")
 
496
 
497
  with gr.Row():
498
+ results_output = gr.Markdown(label="Results")
 
499
 
500
  with gr.Row():
501
+ json_output = gr.Code(
502
+ label="πŸ“₯ JSON Results",
503
+ language="json",
504
+ lines=20
505
+ )
 
506
 
507
  analyze_btn.click(
508
  fn=analyzer.process_files,
 
512
 
513
  gr.Markdown("""
514
  ---
515
+ ### πŸ’‘ Tips:
516
+ - The system uses semantic variable names (e.g., `Building_Efficiency` instead of `E1`)
517
+ - Formulas cascade: computed values enable more formulas in subsequent iterations
518
+ - Non-computable formulas show which variables are missing
 
519
  """)
520
 
521
  if __name__ == "__main__":