mlbench123 commited on
Commit
992c406
·
verified ·
1 Parent(s): 40aff38

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +464 -2
app.py CHANGED
@@ -16,8 +16,6 @@ Extracts data from PDFs, solves formulas with Gemini API, generates Excel
16
  """
17
  from fastapi.middleware.cors import CORSMiddleware
18
 
19
-
20
-
21
  import re
22
  import json
23
  from pathlib import Path
@@ -28,6 +26,12 @@ from openpyxl.utils import get_column_letter
28
  from pdfminer.high_level import extract_text
29
  import google.generativeai as genai
30
 
 
 
 
 
 
 
31
  class RealEstateModelPipeline:
32
  def __init__(self, gemini_api_key: str):
33
  """Initialize pipeline with Gemini API key"""
@@ -1995,6 +1999,464 @@ async def analyze_only(files: List[UploadFile] = File(...)):
1995
  except Exception as e:
1996
  raise HTTPException(status_code=500, detail=str(e))
1997
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1998
 
1999
  def process_pdfs(pdf_files):
2000
  """Process uploaded PDFs and return Excel file"""
 
16
  """
17
  from fastapi.middleware.cors import CORSMiddleware
18
 
 
 
19
  import re
20
  import json
21
  from pathlib import Path
 
26
  from pdfminer.high_level import extract_text
27
  import google.generativeai as genai
28
 
29
# Add logging configuration
import logging
# basicConfig is a no-op if the root logger already has handlers (e.g. when
# uvicorn configured logging first), so this is safe at import time.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the document-analysis helpers below.
logger = logging.getLogger(__name__)
33
+
34
+
35
  class RealEstateModelPipeline:
36
  def __init__(self, gemini_api_key: str):
37
  """Initialize pipeline with Gemini API key"""
 
1999
  except Exception as e:
2000
  raise HTTPException(status_code=500, detail=str(e))
2001
 
2002
@app.post("/api/analyze-documents")
async def analyze_documents(
    files: List[UploadFile] = File(...),
    max_pages_per_doc: int = 2,
    confidence_threshold: float = 0.7
):
    """Batch document relevance analysis endpoint.

    Saves the uploads into a throwaway directory, classifies each one for
    real-estate/metrics relevance (only the first few pages are read), and
    returns a per-file breakdown plus batch-level summary statistics.

    Parameters:
    - files: uploaded documents (PDF, XLSX, DOCX, TXT, CSV, ...)
    - max_pages_per_doc: pages to inspect per document, 1..10 (default 2)
    - confidence_threshold: minimum score to flag as relevant, 0.1..1.0 (default 0.7)

    Returns:
    - JSONResponse with "summary", per-file "analysis", and request "metadata"
    """
    # Guard clauses: reject malformed requests before touching the filesystem.
    if not files:
        raise HTTPException(status_code=400, detail="No files uploaded")
    if not 1 <= max_pages_per_doc <= 10:
        raise HTTPException(status_code=400, detail="max_pages_per_doc must be between 1 and 10")
    if not 0.1 <= confidence_threshold <= 1.0:
        raise HTTPException(status_code=400, detail="confidence_threshold must be between 0.1 and 1.0")

    workspace = None
    try:
        # Unique scratch directory so concurrent requests never collide.
        workspace = tempfile.mkdtemp(prefix="doc_analysis_")
        logger.info(f"Created temp directory: {workspace}")

        per_file_results = await process_documents_parallel(
            files, workspace, max_pages_per_doc, confidence_threshold
        )
        summary = generate_analysis_summary(per_file_results)

        payload = {
            "status": "success",
            "summary": summary,
            "analysis": per_file_results,
            "metadata": {
                "total_files": len(files),
                "relevant_files": summary["relevant_count"],
                "non_relevant_files": summary["non_relevant_count"],
                "confidence_threshold": confidence_threshold,
                "max_pages_analyzed": max_pages_per_doc,
                "processing_time_seconds": summary["processing_time_seconds"],
            },
        }

        logger.info(f"Document analysis completed: {summary['relevant_count']}/{len(files)} relevant files")
        return JSONResponse(content=payload)

    except Exception as e:
        logger.error(f"Document analysis error: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Document analysis failed: {str(e)}"
        )
    finally:
        # Always remove the scratch directory, even on failure.
        if workspace and os.path.exists(workspace):
            try:
                shutil.rmtree(workspace)
                logger.info(f"Cleaned up temp directory: {workspace}")
            except Exception as e:
                logger.warning(f"Failed to cleanup temp directory: {str(e)}")
2077
+
2078
+
2079
async def process_documents_parallel(
    files: List[UploadFile],
    temp_dir: str,
    max_pages: int,
    confidence_threshold: float
) -> List[Dict]:
    """Persist all uploads into ``temp_dir`` and analyze them concurrently.

    Files are written to disk sequentially (UploadFile.read is async), then
    one analysis task per file runs under asyncio.gather. A failure in one
    file is converted into an in-band error record instead of aborting the
    whole batch.

    Returns one result dict per input file, in input order.
    """
    import asyncio

    # Save every upload first so the analysis tasks work on stable paths.
    saved_paths = []
    for upload_file in files:
        # secure_filename prevents traversal out of temp_dir.
        file_path = Path(temp_dir) / secure_filename(upload_file.filename)
        with open(file_path, "wb") as f:
            content = await upload_file.read()
            f.write(content)
        saved_paths.append((file_path, upload_file.filename, upload_file.content_type))

    tasks = [
        analyze_single_document(
            file_path, filename, content_type, max_pages, confidence_threshold
        )
        for file_path, filename, content_type in saved_paths
    ]

    # return_exceptions=True keeps one bad file from cancelling the rest.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    processed_results = []
    for i, result in enumerate(results):
        filename = saved_paths[i][1]
        if isinstance(result, Exception):
            # BUG FIX: previously logged the literal string "(unknown)"
            # instead of the actual file name, making failures untraceable.
            logger.error("Error processing %s: %s", filename, result)
            processed_results.append({
                "filename": filename,
                "relevant": False,
                "confidence": 0.0,
                "error": str(result),
                "reason": "Processing failed",
                "key_indicators": []
            })
        else:
            processed_results.append(result)

    return processed_results
2126
+
2127
+
2128
async def analyze_single_document(
    file_path: Path,
    filename: str,
    content_type: str,
    max_pages: int,
    confidence_threshold: float
) -> Dict:
    """Analyze one saved document for real-estate relevance.

    Extracts up to ``max_pages`` pages of text, short-circuits on unreadable
    content, then delegates classification to Gemini (with keyword fallback).
    Always returns a result dict — errors are reported in-band, never raised,
    so one bad file cannot abort a batch.
    """
    import time  # defensive local import; harmless if already imported at module level

    start_time = time.time()

    try:
        # Only the first max_pages pages are read — enough for relevance triage.
        extracted_text = await extract_document_text(
            file_path, content_type, max_pages
        )

        # Under ~50 chars usually means a scan, an empty file, or failed extraction.
        if not extracted_text or len(extracted_text.strip()) < 50:
            return {
                "filename": filename,
                "relevant": False,
                "confidence": 0.0,
                "reason": "Insufficient or unreadable text content",
                "key_indicators": [],
                "text_sample": extracted_text[:200] if extracted_text else ""
            }

        analysis_result = await analyze_with_gemini(extracted_text, confidence_threshold)

        processing_time = time.time() - start_time

        return {
            "filename": filename,
            "relevant": analysis_result["relevant"],
            "confidence": analysis_result["confidence"],
            "reason": analysis_result["reason"],
            "key_indicators": analysis_result["key_indicators"],
            "document_type": analysis_result.get("document_type", "unknown"),
            "text_sample": extracted_text[:500],  # first 500 chars for debugging
            "processing_time_seconds": round(processing_time, 2),
            # NOTE: estimate_page_count is currently a stub returning 1, so this is always 1.
            "pages_analyzed": min(max_pages, estimate_page_count(file_path, content_type))
        }

    except Exception as e:
        # BUG FIX: previously logged the literal string "(unknown)" instead
        # of the actual file name, making failures untraceable.
        logger.error("Error analyzing %s: %s", filename, e)
        return {
            "filename": filename,
            "relevant": False,
            "confidence": 0.0,
            "error": str(e),
            "reason": "Analysis error",
            "key_indicators": []
        }
2182
+
2183
+
2184
async def extract_document_text(file_path: Path, content_type: str, max_pages: int) -> str:
    """Extract text from a document, dispatching on file extension.

    Unknown extensions fall back to plain-text reading; any extraction
    failure is logged and returned as "".
    """
    # Extension -> extractor dispatch table; unknown types read as text.
    extractors = {
        '.pdf': extract_pdf_text_limited,
        '.xlsx': extract_excel_text_limited,
        '.xls': extract_excel_text_limited,
        '.docx': extract_docx_text_limited,
        '.doc': extract_docx_text_limited,
        '.txt': extract_text_file_limited,
        '.csv': extract_text_file_limited,
    }
    extractor = extractors.get(file_path.suffix.lower(), extract_text_file_limited)

    try:
        return extractor(file_path, max_pages)
    except Exception as e:
        logger.warning(f"Text extraction failed for {file_path}: {str(e)}")
        return ""
2205
+
2206
+
2207
def extract_pdf_text_limited(pdf_path: Path, max_pages: int) -> str:
    """Return text from at most the first ``max_pages`` pages of a PDF.

    Any failure (bad file, missing library) is logged and yields "".
    """
    try:
        from pdfminer.high_level import extract_text
        from pdfminer.layout import LAParams

        # pdfminer's maxpages stops extraction after the requested page count.
        raw = extract_text(str(pdf_path), laparams=LAParams(), maxpages=max_pages)
        return raw.strip()
    except Exception as e:
        logger.error(f"PDF extraction error: {str(e)}")
        return ""
2223
+
2224
+
2225
def extract_excel_text_limited(excel_path: Path, max_sheets: int) -> str:
    """Extract a text preview from the first ``max_sheets`` sheets of a workbook.

    Reads at most 50 rows per sheet and renders at most 20 of them, so large
    workbooks stay cheap to preview. Unreadable sheets are skipped with a
    warning; an unreadable workbook yields "".
    """
    try:
        import pandas as pd

        extracted_content = []
        # BUG FIX: pd.ExcelFile was never closed, leaking the underlying file
        # handle on every call; the context manager guarantees closure.
        with pd.ExcelFile(excel_path) as xlsx:
            for sheet_name in xlsx.sheet_names[:max_sheets]:
                try:
                    df = pd.read_excel(xlsx, sheet_name=sheet_name, nrows=50)  # first 50 rows
                    extracted_content.append(f"=== Sheet: {sheet_name} ===")
                    extracted_content.append(df.to_string(index=False, max_rows=20))
                    extracted_content.append("\n")
                except Exception as e:
                    logger.warning(f"Could not read sheet {sheet_name}: {str(e)}")
                    continue

        return "\n".join(extracted_content)
    except Exception as e:
        logger.error(f"Excel extraction error: {str(e)}")
        return ""
2250
+
2251
+
2252
def extract_docx_text_limited(docx_path: Path, max_pages: int) -> str:
    """Pull text from roughly the first ``max_pages`` pages of a DOCX file.

    DOCX has no hard page boundaries, so a page is approximated as 10
    non-empty paragraphs. Returns "" when the file cannot be parsed.
    """
    try:
        import docx

        paragraph_budget = max_pages * 10  # ~10 non-empty paragraphs per "page"
        collected = []
        for paragraph in docx.Document(str(docx_path)).paragraphs:
            if len(collected) >= paragraph_budget:
                break
            if paragraph.text.strip():
                collected.append(paragraph.text)

        return "\n".join(collected)
    except Exception as e:
        logger.error(f"DOCX extraction error: {str(e)}")
        return ""
2275
+
2276
+
2277
def extract_text_file_limited(file_path: Path, max_pages: int) -> str:
    """Read a bounded prefix of a plain-text/CSV file.

    A "page" is approximated as 50 lines; at most ``max_pages * 50`` lines
    are returned, newlines included. Read failures yield "".
    """
    try:
        line_budget = max_pages * 50  # ~50 lines per "page"

        collected = []
        # errors='ignore' tolerates mixed/unknown encodings in uploads.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as fh:
            for index, line in enumerate(fh):
                if index == line_budget:
                    break
                collected.append(line)

        return "".join(collected)
    except Exception as e:
        logger.error(f"Text file extraction error: {str(e)}")
        return ""
2294
+
2295
+
2296
def estimate_page_count(file_path: Path, content_type: str) -> int:
    """Estimate the document's page count.

    NOTE(review): this is a placeholder — it ignores both arguments and
    always reports 1, so callers computing pages_analyzed always see 1.
    """
    return 1
2300
+
2301
+
2302
async def analyze_with_gemini(text: str, confidence_threshold: float) -> Dict:
    """Classify document relevance with Gemini, with a keyword fallback.

    Sends a prompt built from the first 8000 characters of ``text``, parses
    the model's JSON reply, validates required fields, and forces
    relevant=False when the model's confidence is below the caller's
    threshold. Any failure (API, parsing, validation) falls back to
    perform_fallback_analysis instead of raising.
    """
    # BUG FIX: asyncio was only imported inside a sibling function, so this
    # coroutine could hit NameError at runtime; import it locally.
    import asyncio

    # BUG FIX: the old prompt embedded the literal source comment
    # "# Limit text to avoid token limits" inside the f-string, sending that
    # comment text to the model. Truncate outside the string instead.
    snippet = text[:8000]  # cap prompt size to stay within token limits

    prompt = f"""
    Analyze this document text and determine if it's relevant to REAL ESTATE and METRICS CALCULATION.

    CRITICAL: You must respond with ONLY a JSON object, no other text.

    DOCUMENT TEXT (first few pages):
    {snippet}

    ANALYSIS INSTRUCTIONS:
    1. Determine if this document is relevant to real estate business, investments, or metrics
    2. Identify key indicators that support your decision
    3. Provide a confidence score (0.0 to 1.0)
    4. Classify the document type if possible

    RELEVANCE CRITERIA:
    - Real estate related: property listings, financial models, market analysis, offering memorandums, rent rolls, operating statements
    - Metrics calculation: financial projections, ROI analysis, cap rates, NOI calculations, cash flow analysis
    - Real estate development: construction costs, pro formas, feasibility studies

    NON-RELEVANT EXAMPLES:
    - Resumes, personal documents, marketing brochures for non-real estate
    - Academic papers unrelated to real estate
    - General business documents without real estate focus

    REQUIRED JSON RESPONSE FORMAT:
    {{
        "relevant": true/false,
        "confidence": 0.85,
        "reason": "Brief explanation of relevance decision",
        "key_indicators": ["indicator1", "indicator2", ...],
        "document_type": "offering_memorandum|financial_statement|market_report|rent_roll|unknown"
    }}

    Confidence threshold for relevance: {confidence_threshold}
    """

    try:
        # Assumes a module-level GEMINI_API_KEY constant — confirm against file top.
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel('gemini-2.0-flash')

        # generate_content is blocking; run it in a worker thread so the event
        # loop stays responsive. get_running_loop() replaces the deprecated
        # get_event_loop() — we are always inside a running coroutine here.
        response = await asyncio.get_running_loop().run_in_executor(
            None,
            lambda: model.generate_content(prompt)
        )

        response_text = response.text.strip()

        # Strip markdown code fences the model sometimes wraps around JSON.
        if "```json" in response_text:
            response_text = response_text.split("```json")[1].split("```")[0].strip()
        elif "```" in response_text:
            response_text = response_text.split("```")[1].split("```")[0].strip()

        result = json.loads(response_text)

        # Reject structurally incomplete replies so the fallback kicks in.
        required_fields = ["relevant", "confidence", "reason", "key_indicators"]
        for field in required_fields:
            if field not in result:
                raise ValueError(f"Missing field in Gemini response: {field}")

        # Enforce the caller's threshold regardless of the model's own verdict.
        if result["confidence"] < confidence_threshold:
            result["relevant"] = False
            result["reason"] = f"Confidence ({result['confidence']}) below threshold ({confidence_threshold})"

        return result

    except Exception as e:
        logger.error(f"Gemini analysis failed: {str(e)}")
        # Fallback: simple keyword-based analysis
        return perform_fallback_analysis(text, confidence_threshold)
2378
+
2379
+
2380
def perform_fallback_analysis(text: str, confidence_threshold: float) -> Dict:
    """Keyword-based relevance scoring used when the Gemini call fails.

    Counts real-estate and metrics keyword hits in the lowercased text,
    normalizes the total to a 0..1 confidence, and marks the document
    relevant only when the score clears the threshold AND at least two
    hits come from one of the two keyword families.
    """
    real_estate_keywords = [
        'real estate', 'property', 'rent', 'lease', 'mortgage', 'cap rate',
        'noi', 'net operating income', 'cash flow', 'pro forma', 'offering memorandum',
        'rent roll', 'operating expenses', 'vacancy rate', 'occupancy', 'square feet',
        'acquisition', 'disposition', 'broker', 'listing', 'appraisal', 'valuation',
        'construction', 'development', 'zoning', 'permit', 'tenant', 'landlord'
    ]

    metrics_keywords = [
        'metrics', 'kpi', 'key performance indicator', 'roi', 'return on investment',
        'irr', 'internal rate of return', 'dscr', 'debt service coverage ratio',
        'ltv', 'loan to value', 'calculation', 'analysis', 'projection', 'forecast',
        'financial model', 'spreadsheet', 'excel', 'numbers', 'data', 'statistics'
    ]

    haystack = text.lower()

    # Substring membership per keyword; True counts as 1 in the sum.
    estate_hits = sum(keyword in haystack for keyword in real_estate_keywords)
    metric_hits = sum(keyword in haystack for keyword in metrics_keywords)

    # Ten combined hits saturate the confidence at 1.0.
    score = min(1.0, (estate_hits + metric_hits) / 10)

    is_relevant = score >= confidence_threshold and (estate_hits >= 2 or metric_hits >= 2)

    indicators = []
    if estate_hits:
        indicators.append(f"Real estate terms found: {estate_hits}")
    if metric_hits:
        indicators.append(f"Metrics terms found: {metric_hits}")

    return {
        "relevant": is_relevant,
        "confidence": round(score, 2),
        "reason": f"Keyword analysis: {estate_hits} real estate terms, {metric_hits} metrics terms",
        "key_indicators": indicators,
        "document_type": "unknown",
    }
2424
+
2425
+
2426
def generate_analysis_summary(analysis_results: List[Dict]) -> Dict:
    """Aggregate per-file analysis dicts into batch-level statistics.

    Counts relevant vs non-relevant files, averages the non-None confidence
    scores, tallies document types, and sums per-file processing times.
    Handles an empty input list (all ratios/averages become 0).
    """
    total = len(analysis_results)
    relevant_total = sum(1 for entry in analysis_results if entry.get('relevant', False))

    # Only entries that actually carry a confidence participate in the average.
    confidences = [entry.get('confidence', 0) for entry in analysis_results
                   if entry.get('confidence') is not None]
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0

    # Tally of document_type labels, defaulting missing labels to 'unknown'.
    type_counts = {}
    for entry in analysis_results:
        label = entry.get('document_type', 'unknown')
        type_counts[label] = type_counts.get(label, 0) + 1

    return {
        "relevant_count": relevant_total,
        "non_relevant_count": total - relevant_total,
        "relevance_rate": relevant_total / total if total else 0,
        "average_confidence": round(avg_confidence, 3),
        "document_type_breakdown": type_counts,
        "processing_time_seconds": sum(entry.get('processing_time_seconds', 0)
                                       for entry in analysis_results)
    }
2450
+
2451
+
2452
def secure_filename(filename: str) -> str:
    """Sanitize an untrusted upload filename to a safe basename.

    BUG FIX: the original only substituted disallowed characters, so it never
    stripped directory components ("a/b.pdf" became "a_b.pdf") and mapped
    empty or dot-only names to "" — which would make callers write to the
    destination directory itself. This version drops directory parts first
    (defeating "../../etc/passwd"-style traversal), then substitutes, then
    falls back to "file" when nothing usable remains.
    """
    import os
    import re

    # Normalize Windows separators, then keep only the final path component.
    basename = os.path.basename(filename.replace("\\", "/"))
    sanitized = re.sub(r'[^a-zA-Z0-9_.-]', '_', basename)
    # Reject names that are empty or consist only of dots/underscores (".", "..", "_").
    if not sanitized.strip("._"):
        return "file"
    return sanitized
2457
+
2458
+
2459
+
2460
 
2461
  def process_pdfs(pdf_files):
2462
  """Process uploaded PDFs and return Excel file"""