mlbench123 commited on
Commit
992c406
·
verified ·
1 Parent(s): 40aff38

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +464 -2
app.py CHANGED
@@ -16,8 +16,6 @@ Extracts data from PDFs, solves formulas with Gemini API, generates Excel
16
  """
17
  from fastapi.middleware.cors import CORSMiddleware
18
 
19
-
20
-
21
  import re
22
  import json
23
  from pathlib import Path
@@ -28,6 +26,12 @@ from openpyxl.utils import get_column_letter
28
  from pdfminer.high_level import extract_text
29
  import google.generativeai as genai
30
 
 
 
 
 
 
 
31
  class RealEstateModelPipeline:
32
  def __init__(self, gemini_api_key: str):
33
  """Initialize pipeline with Gemini API key"""
@@ -1995,6 +1999,464 @@ async def analyze_only(files: List[UploadFile] = File(...)):
1995
  except Exception as e:
1996
  raise HTTPException(status_code=500, detail=str(e))
1997
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1998
 
1999
  def process_pdfs(pdf_files):
2000
  """Process uploaded PDFs and return Excel file"""
 
16
  """
17
  from fastapi.middleware.cors import CORSMiddleware
18
 
 
 
19
  import re
20
  import json
21
  from pathlib import Path
 
26
  from pdfminer.high_level import extract_text
27
  import google.generativeai as genai
28
 
29
# Add logging configuration
import logging
# basicConfig is a no-op if the root logger already has handlers (e.g. when
# uvicorn configured logging first), so this is safe at import time.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the document-analysis helpers below.
logger = logging.getLogger(__name__)
33
+
34
+
35
  class RealEstateModelPipeline:
36
  def __init__(self, gemini_api_key: str):
37
  """Initialize pipeline with Gemini API key"""
 
1999
  except Exception as e:
2000
  raise HTTPException(status_code=500, detail=str(e))
2001
 
2002
@app.post("/api/analyze-documents")
async def analyze_documents(
    files: List[UploadFile] = File(...),
    max_pages_per_doc: int = 2,
    confidence_threshold: float = 0.7
):
    """Batch document relevance analysis endpoint.

    Saves the uploads into a throwaway directory, classifies each one for
    real-estate/metrics relevance (only the first few pages are read), and
    returns a per-file breakdown plus batch-level summary statistics.

    Parameters:
    - files: uploaded documents (PDF, XLSX, DOCX, TXT, CSV, ...)
    - max_pages_per_doc: pages to inspect per document, 1..10 (default 2)
    - confidence_threshold: minimum score to flag as relevant, 0.1..1.0 (default 0.7)

    Returns:
    - JSONResponse with "summary", per-file "analysis", and request "metadata"
    """
    # Guard clauses: reject malformed requests before touching the filesystem.
    if not files:
        raise HTTPException(status_code=400, detail="No files uploaded")
    if not 1 <= max_pages_per_doc <= 10:
        raise HTTPException(status_code=400, detail="max_pages_per_doc must be between 1 and 10")
    if not 0.1 <= confidence_threshold <= 1.0:
        raise HTTPException(status_code=400, detail="confidence_threshold must be between 0.1 and 1.0")

    workspace = None
    try:
        # Unique scratch directory so concurrent requests never collide.
        workspace = tempfile.mkdtemp(prefix="doc_analysis_")
        logger.info(f"Created temp directory: {workspace}")

        per_file_results = await process_documents_parallel(
            files, workspace, max_pages_per_doc, confidence_threshold
        )
        summary = generate_analysis_summary(per_file_results)

        payload = {
            "status": "success",
            "summary": summary,
            "analysis": per_file_results,
            "metadata": {
                "total_files": len(files),
                "relevant_files": summary["relevant_count"],
                "non_relevant_files": summary["non_relevant_count"],
                "confidence_threshold": confidence_threshold,
                "max_pages_analyzed": max_pages_per_doc,
                "processing_time_seconds": summary["processing_time_seconds"],
            },
        }

        logger.info(f"Document analysis completed: {summary['relevant_count']}/{len(files)} relevant files")
        return JSONResponse(content=payload)

    except Exception as e:
        logger.error(f"Document analysis error: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Document analysis failed: {str(e)}"
        )
    finally:
        # Always remove the scratch directory, even on failure.
        if workspace and os.path.exists(workspace):
            try:
                shutil.rmtree(workspace)
                logger.info(f"Cleaned up temp directory: {workspace}")
            except Exception as e:
                logger.warning(f"Failed to cleanup temp directory: {str(e)}")
2077
+
2078
+
2079
async def process_documents_parallel(
    files: List[UploadFile],
    temp_dir: str,
    max_pages: int,
    confidence_threshold: float
) -> List[Dict]:
    """Persist all uploads into ``temp_dir`` and analyze them concurrently.

    Files are written to disk sequentially (UploadFile.read is async), then
    one analysis task per file runs under asyncio.gather. A failure in one
    file is converted into an in-band error record instead of aborting the
    whole batch.

    Returns one result dict per input file, in input order.
    """
    import asyncio

    # Save every upload first so the analysis tasks work on stable paths.
    saved_paths = []
    for upload_file in files:
        # secure_filename prevents traversal out of temp_dir.
        file_path = Path(temp_dir) / secure_filename(upload_file.filename)
        with open(file_path, "wb") as f:
            content = await upload_file.read()
            f.write(content)
        saved_paths.append((file_path, upload_file.filename, upload_file.content_type))

    tasks = [
        analyze_single_document(
            file_path, filename, content_type, max_pages, confidence_threshold
        )
        for file_path, filename, content_type in saved_paths
    ]

    # return_exceptions=True keeps one bad file from cancelling the rest.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    processed_results = []
    for i, result in enumerate(results):
        filename = saved_paths[i][1]
        if isinstance(result, Exception):
            # BUG FIX: previously logged the literal string "(unknown)"
            # instead of the actual file name, making failures untraceable.
            logger.error("Error processing %s: %s", filename, result)
            processed_results.append({
                "filename": filename,
                "relevant": False,
                "confidence": 0.0,
                "error": str(result),
                "reason": "Processing failed",
                "key_indicators": []
            })
        else:
            processed_results.append(result)

    return processed_results
2126
+
2127
+
2128
async def analyze_single_document(
    file_path: Path,
    filename: str,
    content_type: str,
    max_pages: int,
    confidence_threshold: float
) -> Dict:
    """Analyze one saved document for real-estate relevance.

    Extracts up to ``max_pages`` pages of text, short-circuits on unreadable
    content, then delegates classification to Gemini (with keyword fallback).
    Always returns a result dict — errors are reported in-band, never raised,
    so one bad file cannot abort a batch.
    """
    import time  # defensive local import; harmless if already imported at module level

    start_time = time.time()

    try:
        # Only the first max_pages pages are read — enough for relevance triage.
        extracted_text = await extract_document_text(
            file_path, content_type, max_pages
        )

        # Under ~50 chars usually means a scan, an empty file, or failed extraction.
        if not extracted_text or len(extracted_text.strip()) < 50:
            return {
                "filename": filename,
                "relevant": False,
                "confidence": 0.0,
                "reason": "Insufficient or unreadable text content",
                "key_indicators": [],
                "text_sample": extracted_text[:200] if extracted_text else ""
            }

        analysis_result = await analyze_with_gemini(extracted_text, confidence_threshold)

        processing_time = time.time() - start_time

        return {
            "filename": filename,
            "relevant": analysis_result["relevant"],
            "confidence": analysis_result["confidence"],
            "reason": analysis_result["reason"],
            "key_indicators": analysis_result["key_indicators"],
            "document_type": analysis_result.get("document_type", "unknown"),
            "text_sample": extracted_text[:500],  # first 500 chars for debugging
            "processing_time_seconds": round(processing_time, 2),
            # NOTE: estimate_page_count is currently a stub returning 1, so this is always 1.
            "pages_analyzed": min(max_pages, estimate_page_count(file_path, content_type))
        }

    except Exception as e:
        # BUG FIX: previously logged the literal string "(unknown)" instead
        # of the actual file name, making failures untraceable.
        logger.error("Error analyzing %s: %s", filename, e)
        return {
            "filename": filename,
            "relevant": False,
            "confidence": 0.0,
            "error": str(e),
            "reason": "Analysis error",
            "key_indicators": []
        }
2182
+
2183
+
2184
async def extract_document_text(file_path: Path, content_type: str, max_pages: int) -> str:
    """Extract text from a document, dispatching on file extension.

    Unknown extensions fall back to plain-text reading; any extraction
    failure is logged and returned as "".
    """
    # Extension -> extractor dispatch table; unknown types read as text.
    extractors = {
        '.pdf': extract_pdf_text_limited,
        '.xlsx': extract_excel_text_limited,
        '.xls': extract_excel_text_limited,
        '.docx': extract_docx_text_limited,
        '.doc': extract_docx_text_limited,
        '.txt': extract_text_file_limited,
        '.csv': extract_text_file_limited,
    }
    extractor = extractors.get(file_path.suffix.lower(), extract_text_file_limited)

    try:
        return extractor(file_path, max_pages)
    except Exception as e:
        logger.warning(f"Text extraction failed for {file_path}: {str(e)}")
        return ""
2205
+
2206
+
2207
def extract_pdf_text_limited(pdf_path: Path, max_pages: int) -> str:
    """Return text from at most the first ``max_pages`` pages of a PDF.

    Any failure (bad file, missing library) is logged and yields "".
    """
    try:
        from pdfminer.high_level import extract_text
        from pdfminer.layout import LAParams

        # pdfminer's maxpages stops extraction after the requested page count.
        raw = extract_text(str(pdf_path), laparams=LAParams(), maxpages=max_pages)
        return raw.strip()
    except Exception as e:
        logger.error(f"PDF extraction error: {str(e)}")
        return ""
2223
+
2224
+
2225
def extract_excel_text_limited(excel_path: Path, max_sheets: int) -> str:
    """Extract a text preview from the first ``max_sheets`` sheets of a workbook.

    Reads at most 50 rows per sheet and renders at most 20 of them, so large
    workbooks stay cheap to preview. Unreadable sheets are skipped with a
    warning; an unreadable workbook yields "".
    """
    try:
        import pandas as pd

        extracted_content = []
        # BUG FIX: pd.ExcelFile was never closed, leaking the underlying file
        # handle on every call; the context manager guarantees closure.
        with pd.ExcelFile(excel_path) as xlsx:
            for sheet_name in xlsx.sheet_names[:max_sheets]:
                try:
                    df = pd.read_excel(xlsx, sheet_name=sheet_name, nrows=50)  # first 50 rows
                    extracted_content.append(f"=== Sheet: {sheet_name} ===")
                    extracted_content.append(df.to_string(index=False, max_rows=20))
                    extracted_content.append("\n")
                except Exception as e:
                    logger.warning(f"Could not read sheet {sheet_name}: {str(e)}")
                    continue

        return "\n".join(extracted_content)
    except Exception as e:
        logger.error(f"Excel extraction error: {str(e)}")
        return ""
2250
+
2251
+
2252
def extract_docx_text_limited(docx_path: Path, max_pages: int) -> str:
    """Pull text from roughly the first ``max_pages`` pages of a DOCX file.

    DOCX has no hard page boundaries, so a page is approximated as 10
    non-empty paragraphs. Returns "" when the file cannot be parsed.
    """
    try:
        import docx

        paragraph_budget = max_pages * 10  # ~10 non-empty paragraphs per "page"
        collected = []
        for paragraph in docx.Document(str(docx_path)).paragraphs:
            if len(collected) >= paragraph_budget:
                break
            if paragraph.text.strip():
                collected.append(paragraph.text)

        return "\n".join(collected)
    except Exception as e:
        logger.error(f"DOCX extraction error: {str(e)}")
        return ""
2275
+
2276
+
2277
def extract_text_file_limited(file_path: Path, max_pages: int) -> str:
    """Read a bounded prefix of a plain-text/CSV file.

    A "page" is approximated as 50 lines; at most ``max_pages * 50`` lines
    are returned, newlines included. Read failures yield "".
    """
    try:
        line_budget = max_pages * 50  # ~50 lines per "page"

        collected = []
        # errors='ignore' tolerates mixed/unknown encodings in uploads.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as fh:
            for index, line in enumerate(fh):
                if index == line_budget:
                    break
                collected.append(line)

        return "".join(collected)
    except Exception as e:
        logger.error(f"Text file extraction error: {str(e)}")
        return ""
2294
+
2295
+
2296
def estimate_page_count(file_path: Path, content_type: str) -> int:
    """Estimate the document's page count.

    NOTE(review): this is a placeholder — it ignores both arguments and
    always reports 1, so callers computing pages_analyzed always see 1.
    """
    return 1
2300
+
2301
+
2302
async def analyze_with_gemini(text: str, confidence_threshold: float) -> Dict:
    """Classify document relevance with Gemini, with a keyword fallback.

    Sends a prompt built from the first 8000 characters of ``text``, parses
    the model's JSON reply, validates required fields, and forces
    relevant=False when the model's confidence is below the caller's
    threshold. Any failure (API, parsing, validation) falls back to
    perform_fallback_analysis instead of raising.
    """
    # BUG FIX: asyncio was only imported inside a sibling function, so this
    # coroutine could hit NameError at runtime; import it locally.
    import asyncio

    # BUG FIX: the old prompt embedded the literal source comment
    # "# Limit text to avoid token limits" inside the f-string, sending that
    # comment text to the model. Truncate outside the string instead.
    snippet = text[:8000]  # cap prompt size to stay within token limits

    prompt = f"""
    Analyze this document text and determine if it's relevant to REAL ESTATE and METRICS CALCULATION.

    CRITICAL: You must respond with ONLY a JSON object, no other text.

    DOCUMENT TEXT (first few pages):
    {snippet}

    ANALYSIS INSTRUCTIONS:
    1. Determine if this document is relevant to real estate business, investments, or metrics
    2. Identify key indicators that support your decision
    3. Provide a confidence score (0.0 to 1.0)
    4. Classify the document type if possible

    RELEVANCE CRITERIA:
    - Real estate related: property listings, financial models, market analysis, offering memorandums, rent rolls, operating statements
    - Metrics calculation: financial projections, ROI analysis, cap rates, NOI calculations, cash flow analysis
    - Real estate development: construction costs, pro formas, feasibility studies

    NON-RELEVANT EXAMPLES:
    - Resumes, personal documents, marketing brochures for non-real estate
    - Academic papers unrelated to real estate
    - General business documents without real estate focus

    REQUIRED JSON RESPONSE FORMAT:
    {{
        "relevant": true/false,
        "confidence": 0.85,
        "reason": "Brief explanation of relevance decision",
        "key_indicators": ["indicator1", "indicator2", ...],
        "document_type": "offering_memorandum|financial_statement|market_report|rent_roll|unknown"
    }}

    Confidence threshold for relevance: {confidence_threshold}
    """

    try:
        # Assumes a module-level GEMINI_API_KEY constant — confirm against file top.
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel('gemini-2.0-flash')

        # generate_content is blocking; run it in a worker thread so the event
        # loop stays responsive. get_running_loop() replaces the deprecated
        # get_event_loop() — we are always inside a running coroutine here.
        response = await asyncio.get_running_loop().run_in_executor(
            None,
            lambda: model.generate_content(prompt)
        )

        response_text = response.text.strip()

        # Strip markdown code fences the model sometimes wraps around JSON.
        if "```json" in response_text:
            response_text = response_text.split("```json")[1].split("```")[0].strip()
        elif "```" in response_text:
            response_text = response_text.split("```")[1].split("```")[0].strip()

        result = json.loads(response_text)

        # Reject structurally incomplete replies so the fallback kicks in.
        required_fields = ["relevant", "confidence", "reason", "key_indicators"]
        for field in required_fields:
            if field not in result:
                raise ValueError(f"Missing field in Gemini response: {field}")

        # Enforce the caller's threshold regardless of the model's own verdict.
        if result["confidence"] < confidence_threshold:
            result["relevant"] = False
            result["reason"] = f"Confidence ({result['confidence']}) below threshold ({confidence_threshold})"

        return result

    except Exception as e:
        logger.error(f"Gemini analysis failed: {str(e)}")
        # Fallback: simple keyword-based analysis
        return perform_fallback_analysis(text, confidence_threshold)
2378
+
2379
+
2380
def perform_fallback_analysis(text: str, confidence_threshold: float) -> Dict:
    """Keyword-based relevance scoring used when the Gemini call fails.

    Counts real-estate and metrics keyword hits in the lowercased text,
    normalizes the total to a 0..1 confidence, and marks the document
    relevant only when the score clears the threshold AND at least two
    hits come from one of the two keyword families.
    """
    real_estate_keywords = [
        'real estate', 'property', 'rent', 'lease', 'mortgage', 'cap rate',
        'noi', 'net operating income', 'cash flow', 'pro forma', 'offering memorandum',
        'rent roll', 'operating expenses', 'vacancy rate', 'occupancy', 'square feet',
        'acquisition', 'disposition', 'broker', 'listing', 'appraisal', 'valuation',
        'construction', 'development', 'zoning', 'permit', 'tenant', 'landlord'
    ]

    metrics_keywords = [
        'metrics', 'kpi', 'key performance indicator', 'roi', 'return on investment',
        'irr', 'internal rate of return', 'dscr', 'debt service coverage ratio',
        'ltv', 'loan to value', 'calculation', 'analysis', 'projection', 'forecast',
        'financial model', 'spreadsheet', 'excel', 'numbers', 'data', 'statistics'
    ]

    haystack = text.lower()

    # Substring membership per keyword; True counts as 1 in the sum.
    estate_hits = sum(keyword in haystack for keyword in real_estate_keywords)
    metric_hits = sum(keyword in haystack for keyword in metrics_keywords)

    # Ten combined hits saturate the confidence at 1.0.
    score = min(1.0, (estate_hits + metric_hits) / 10)

    is_relevant = score >= confidence_threshold and (estate_hits >= 2 or metric_hits >= 2)

    indicators = []
    if estate_hits:
        indicators.append(f"Real estate terms found: {estate_hits}")
    if metric_hits:
        indicators.append(f"Metrics terms found: {metric_hits}")

    return {
        "relevant": is_relevant,
        "confidence": round(score, 2),
        "reason": f"Keyword analysis: {estate_hits} real estate terms, {metric_hits} metrics terms",
        "key_indicators": indicators,
        "document_type": "unknown",
    }
2424
+
2425
+
2426
def generate_analysis_summary(analysis_results: List[Dict]) -> Dict:
    """Aggregate per-file analysis dicts into batch-level statistics.

    Counts relevant vs non-relevant files, averages the non-None confidence
    scores, tallies document types, and sums per-file processing times.
    Handles an empty input list (all ratios/averages become 0).
    """
    total = len(analysis_results)
    relevant_total = sum(1 for entry in analysis_results if entry.get('relevant', False))

    # Only entries that actually carry a confidence participate in the average.
    confidences = [entry.get('confidence', 0) for entry in analysis_results
                   if entry.get('confidence') is not None]
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0

    # Tally of document_type labels, defaulting missing labels to 'unknown'.
    type_counts = {}
    for entry in analysis_results:
        label = entry.get('document_type', 'unknown')
        type_counts[label] = type_counts.get(label, 0) + 1

    return {
        "relevant_count": relevant_total,
        "non_relevant_count": total - relevant_total,
        "relevance_rate": relevant_total / total if total else 0,
        "average_confidence": round(avg_confidence, 3),
        "document_type_breakdown": type_counts,
        "processing_time_seconds": sum(entry.get('processing_time_seconds', 0)
                                       for entry in analysis_results)
    }
2450
+
2451
+
2452
def secure_filename(filename: str) -> str:
    """Sanitize an untrusted upload filename to a safe basename.

    BUG FIX: the original only substituted disallowed characters, so it never
    stripped directory components ("a/b.pdf" became "a_b.pdf") and mapped
    empty or dot-only names to "" — which would make callers write to the
    destination directory itself. This version drops directory parts first
    (defeating "../../etc/passwd"-style traversal), then substitutes, then
    falls back to "file" when nothing usable remains.
    """
    import os
    import re

    # Normalize Windows separators, then keep only the final path component.
    basename = os.path.basename(filename.replace("\\", "/"))
    sanitized = re.sub(r'[^a-zA-Z0-9_.-]', '_', basename)
    # Reject names that are empty or consist only of dots/underscores (".", "..", "_").
    if not sanitized.strip("._"):
        return "file"
    return sanitized
2457
+
2458
+
2459
+
2460
 
2461
  def process_pdfs(pdf_files):
2462
  """Process uploaded PDFs and return Excel file"""