Spaces:

Ash2749
/

advanced-multilang-ocr

Sleeping

App Files Files Community

advanced-multilang-ocr / main.py

Ash2749

Upload 17 files

befccc3 verified 5 months ago

raw

history blame contribute delete

24 kB

	# main.py - FastAPI Web Service for Advanced Multi-Language OCR
	# Integrates main6_pix2text.py functionality with REST API endpoints

	import os
	import json
	import shutil
	from datetime import datetime
	from pathlib import Path
	from typing import Optional, Dict, Any, List
	import uvicorn

	from fastapi import FastAPI, File, UploadFile, HTTPException, status
	from fastapi.responses import HTMLResponse, JSONResponse
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel, Field

	# Import our OCR functionality
	from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text

	# Import evaluation functionality
	from eval import evaluate_ocr_accuracy, clean_control_characters

	# Initialize FastAPI app
	# Long-form overview passed to FastAPI as the API description. The text is
	# kept exactly as authored because it is served to clients verbatim.
	_API_DESCRIPTION = """
	🔍 Advanced OCR System for Multi-Language Text Extraction

	This API provides sophisticated text extraction from PDF documents containing:
	- English text
	- Bangla (Bengali) text
	- Mathematical expressions and formulas

	## Features
	- Upload PDF files for processing
	- Intelligent content classification
	- Pix2Text integration for advanced math extraction
	- Character-by-character analysis
	- Comprehensive extraction reports

	## Usage
	1. Upload a PDF using the `/extract` endpoint
	2. Get extracted text with detailed analysis
	3. Files are saved with organized naming convention
	"""

	# Contact metadata published alongside the API description.
	_API_CONTACT = {
	    "name": "Advanced OCR System",
	    "url": "https://github.com/ashfaqbracu/aaladinai",
	}

	# The application object; every route, handler and middleware below
	# registers itself on this instance.
	app = FastAPI(
	    title="Advanced Multi-Language OCR API",
	    description=_API_DESCRIPTION,
	    version="1.0.0",
	    contact=_API_CONTACT,
	)

	# Add CORS middleware so browser clients on any origin can call the API.
	# Credentials stay disabled: a wildcard origin combined with
	# allow_credentials=True is not a valid CORS configuration, and no route in
	# this file relies on cookies or other browser-held credentials.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_credentials=False,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)


	# Exception handler for JSON decode errors
	# NOTE(review): FastAPI reports malformed request bodies through
	# RequestValidationError; registering for status code 422 may not intercept
	# those. Confirm whether this handler should also be registered for that
	# exception class.
	@app.exception_handler(422)
	async def json_decode_error_handler(request, exc):
	    """
	    Handle JSON decode errors that occur due to control characters in request body.

	    Args:
	        request: The incoming request (unused, required by the handler signature).
	        exc: The exception being handled.

	    Returns:
	        A 400 JSONResponse with cleanup guidance when the error text mentions
	        "JSON decode error"; otherwise a 422 JSONResponse that carries the
	        original error detail unchanged.
	    """
	    # Not every exception type exposes `.detail`; read it defensively once so
	    # the membership test and the response body agree.
	    has_detail = hasattr(exc, "detail")
	    detail_text = str(exc.detail) if has_detail else str(exc)

	    if "JSON decode error" in detail_text:
	        return JSONResponse(
	            status_code=400,
	            content={
	                "error": "Invalid characters in request",
	                "message": "The request contains invalid control characters. Please ensure your text data is properly encoded and does not contain control characters.",
	                "suggestion": "Try cleaning your input text to remove any control characters before sending the request.",
	                "details": detail_text,
	            },
	        )

	    # For other 422 errors, answer with the original error. Re-raising from
	    # inside an exception handler would surface to the client as a 500.
	    return JSONResponse(
	        status_code=422,
	        content={"detail": exc.detail if has_detail else detail_text},
	        headers=getattr(exc, "headers", None),
	    )


	# Create directories
	def create_directories():
	    """Make sure the `documents` and `extracted` folders exist in the working directory."""
	    for folder_name in ("documents", "extracted"):
	        folder = Path(folder_name)
	        # exist_ok keeps repeated start-ups from failing on an existing folder.
	        folder.mkdir(exist_ok=True)
	        print(f"✅ Created/verified directory: {folder_name}")


	# Run once at import time so the folders are present before any request.
	create_directories()


	# Pydantic models for API responses
	class ExtractionResult(BaseModel):
	    """Model for OCR extraction results."""

	    # Outcome flag plus the name of the upload as the client sent it.
	    success: bool = Field(..., description="Whether the extraction was successful")
	    filename: str = Field(..., description="Name of the processed file")

	    # Extracted text and where the three output artefacts were written.
	    extracted_text: str = Field(..., description="Full extracted text content")
	    text_file_path: str = Field(..., description="Path to the saved text file")
	    json_file_path: str = Field(..., description="Path to the detailed JSON results")
	    analysis_file_path: str = Field(..., description="Path to the analysis report")

	    # Parsed content of the analysis file, or {} on failure.
	    extraction_summary: Dict[str, Any] = Field(
	        ..., description="Summary of extraction statistics"
	    )
	    processing_time: float = Field(
	        ..., description="Time taken for processing (seconds)"
	    )

	    # Populated only on the failure path.
	    error_message: Optional[str] = Field(
	        None, description="Error message if extraction failed"
	    )


	class HealthCheck(BaseModel):
	    """Model for health check response."""

	    # "healthy" or "unhealthy", with a human-readable explanation.
	    status: str = Field(..., description="Service status")
	    message: str = Field(..., description="Status message")

	    # True when initialize_pix2text() returned a model object.
	    pix2text_available: bool = Field(
	        ..., description="Whether Pix2Text is available"
	    )

	    # Maps each storage folder name to whether it is present.
	    directories: Dict[str, bool] = Field(
	        ..., description="Directory availability status"
	    )


	class FileInfo(BaseModel):
	    """Model for file information."""

	    # NOTE(review): no route shown in this file returns this model.
	    filename: str = Field(..., description="Name of the file")
	    size: int = Field(..., description="File size in bytes")
	    content_type: str = Field(..., description="MIME content type")
	    upload_time: str = Field(..., description="Upload timestamp")


	class EvaluationRequest(BaseModel):
	    """Model for OCR evaluation request with optional name."""

	    # NOTE(review): the /eval route shown in this file takes evaluation_name
	    # as a plain parameter and does not use this model.
	    evaluation_name: Optional[str] = Field(
	        None, description="Optional name for the evaluation"
	    )


	class EvaluationResult(BaseModel):
	    """Model for OCR evaluation results."""

	    # Outcome flag and the caller-supplied label, if any.
	    success: bool = Field(..., description="Whether the evaluation was successful")
	    evaluation_name: Optional[str] = Field(
	        None, description="Name of the evaluation"
	    )

	    # Headline scores.
	    overall_accuracy: float = Field(
	        ..., description="Overall accuracy score (0-100)"
	    )
	    similarity_score: float = Field(
	        ..., description="Text similarity score (0-100)"
	    )

	    # Per-granularity breakdowns, passed through from the evaluator as-is.
	    character_metrics: Dict[str, Any] = Field(
	        ..., description="Character-level accuracy metrics"
	    )
	    word_metrics: Dict[str, Any] = Field(
	        ..., description="Word-level accuracy metrics"
	    )
	    line_metrics: Dict[str, Any] = Field(
	        ..., description="Line-level accuracy metrics"
	    )
	    language_specific: Dict[str, Any] = Field(
	        ..., description="Language-specific accuracy metrics"
	    )
	    text_statistics: Dict[str, Any] = Field(
	        ..., description="Text comparison statistics"
	    )
	    detailed_diff: List[Dict[str, str]] = Field(
	        ..., description="Detailed diff showing changes"
	    )
	    evaluation_summary: Dict[str, Any] = Field(
	        ..., description="Summary with grade and recommendations"
	    )

	    processing_time: float = Field(
	        ..., description="Time taken for evaluation (seconds)"
	    )

	    # Populated only on the failure path.
	    error_message: Optional[str] = Field(
	        None, description="Error message if evaluation failed"
	    )


	# Utility functions
	def get_safe_filename(filename: str) -> str:
	    """
	    Generate a filesystem-safe filename with a timestamp.

	    Only letters, digits, "-" and "_" survive in the stem, and only letters
	    and digits survive in the extension, so spaces, path separators and
	    control characters from the client-supplied name never reach the disk.

	    Args:
	        filename: Client-supplied file name. May be empty or None.

	    Returns:
	        "<stem>_<YYYYmmdd_HHMMSS><.ext>". The stem falls back to "file" when
	        nothing in the original stem is usable, and the extension is omitted
	        when nothing in it is usable.
	    """
	    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	    stem, ext = os.path.splitext(filename or "")

	    # Drop everything except alphanumerics, dashes and underscores.
	    safe_stem = "".join(ch for ch in stem if ch.isalnum() or ch in "-_")
	    if not safe_stem:
	        # Avoid names that start with "_" and carry no hint of their origin.
	        safe_stem = "file"

	    # The extension comes from the client as well, so clean it the same way.
	    safe_ext = "".join(ch for ch in ext if ch.isalnum())
	    suffix = f".{safe_ext}" if safe_ext else ""

	    return f"{safe_stem}_{timestamp}{suffix}"


	def get_extraction_filename(pdf_filename: str, file_type: str) -> str:
	    """
	    Generate the output filename for one extraction artefact of a PDF.

	    Convention: "[pdf_filename]_extract.[extension]" for the text and JSON
	    outputs. The analysis report is also JSON, so it gets its own
	    "_extract_analysis.json" suffix; otherwise it would share a path with the
	    detailed JSON output and one file would overwrite the other.

	    Args:
	        pdf_filename: Name of the source PDF; its extension is dropped.
	        file_type: One of "txt", "json" or "analysis". Any other value is
	            treated like "txt".

	    Returns:
	        The artefact filename (no directory component is added).
	    """
	    base_name = os.path.splitext(pdf_filename)[0]
	    suffixes = {
	        "txt": "_extract.txt",
	        "json": "_extract.json",
	        "analysis": "_extract_analysis.json",
	    }
	    return base_name + suffixes.get(file_type, "_extract.txt")


	# API Routes


	@app.get("/", response_class=HTMLResponse, tags=["Main"])
	async def root():
	    """
	    🏠 Main Route - Welcome Page

	    Returns a welcome page with information about the OCR API service.
	    """
	    # The page is static, so the markup is returned as a literal; the
	    # HTMLResponse response class on the route serves it as HTML.
	    return """
	<!DOCTYPE html>
	<html>
	<head>
	<title>Advanced Multi-Language OCR API</title>
	<style>
	body { font-family: Arial, sans-serif; margin: 40px; background: #f5f5f5; }
	.container { background: white; padding: 30px; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
	h1 { color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; }
	.feature { background: #ecf0f1; padding: 15px; margin: 10px 0; border-radius: 5px; }
	.highlight { color: #e74c3c; font-weight: bold; }
	.button { background: #3498db; color: white; padding: 10px 20px; text-decoration: none; border-radius: 5px; display: inline-block; margin: 10px 5px; }
	.stats { display: flex; gap: 20px; margin: 20px 0; }
	.stat { background: #2ecc71; color: white; padding: 15px; border-radius: 5px; text-align: center; flex: 1; }
	</style>
	</head>
	<body>
	<div class="container">
	<h1>🔍 Advanced Multi-Language OCR API</h1>

	<p>Welcome to the <span class="highlight">most advanced OCR system</span> for extracting text from PDFs containing mixed languages and mathematical expressions!</p>

	<div class="stats">
	<div class="stat">
	<h3>🌐 Multi-Language</h3>
	<p>English + Bangla + Math</p>
	</div>
	<div class="stat">
	<h3>🧠 AI-Powered</h3>
	<p>Pix2Text Integration</p>
	</div>
	<div class="stat">
	<h3>📊 Detailed Analysis</h3>
	<p>Character-level Classification</p>
	</div>
	</div>

	<h2>🚀 Key Features</h2>
	<div class="feature">
	<strong>📄 PDF Processing:</strong> Upload PDFs and get comprehensive text extraction
	</div>
	<div class="feature">
	<strong>🔤 Language Detection:</strong> Automatic classification of English, Bangla, and mathematical content
	</div>
	<div class="feature">
	<strong>🧮 Mathematical Expressions:</strong> Advanced LaTeX and formula recognition using Pix2Text
	</div>
	<div class="feature">
	<strong>📈 Detailed Reports:</strong> Character analysis, confidence scores, and extraction statistics
	</div>
	<div class="feature">
	<strong>💾 Organized Storage:</strong> Automatic file organization with clear naming conventions
	</div>
	<div class="feature">
	<strong>📊 Accuracy Evaluation:</strong> Upload text files to compare OCR results with baseline and get detailed accuracy metrics
	</div>

	<h2>📋 API Endpoints</h2>
	<p><strong>GET /:</strong> This welcome page</p>
	<p><strong>GET /health:</strong> Service health status</p>
	<p><strong>POST /extract:</strong> Upload PDF and extract text</p>
	<p><strong>POST /eval:</strong> Upload two text files to evaluate OCR accuracy</p>

	<h2>🔗 Quick Actions</h2>
	<a href="/docs" class="button">📖 API Documentation (Swagger)</a>
	<a href="/redoc" class="button">📚 Alternative Docs (ReDoc)</a>
	<a href="/health" class="button">🏥 Health Check</a>

	<hr style="margin: 30px 0;">
	<p style="text-align: center; color: #7f8c8d;">
	<strong>Advanced Multi-Language OCR System</strong><br>
	Powered by Tesseract, Pix2Text, and FastAPI<br>
	<a href="https://github.com/ashfaqbracu/aaladinai" style="color: #3498db;">GitHub Repository</a>
	</p>
	</div>
	</body>
	</html>
	"""


	@app.get("/health", response_model=HealthCheck, tags=["System"])
	async def health_check():
	    """
	    🏥 Health Check

	    Check the health status of the OCR service and its dependencies.
	    """
	    try:
	        # Pix2Text counts as available when the initializer hands back a model.
	        pix2text_available = initialize_pix2text() is not None

	        # Recreate any missing storage folder; once that succeeds every folder
	        # is reported as present.
	        directories_status = {}
	        for folder_name in ("documents", "extracted"):
	            folder = Path(folder_name)
	            if not folder.exists():
	                folder.mkdir(exist_ok=True)
	            directories_status[folder_name] = True
	    except Exception as exc:
	        # Any failure above is reported in the payload instead of raising.
	        return HealthCheck(
	            status="unhealthy",
	            message=f"Service health check failed: {exc!s}",
	            pix2text_available=False,
	            directories={"documents": False, "extracted": False},
	        )

	    return HealthCheck(
	        status="healthy",
	        message="OCR service is running and ready to process files",
	        pix2text_available=pix2text_available,
	        directories=directories_status,
	    )


	# Request flow: validate the upload, store it under documents/, run the OCR
	# pipeline (which writes its outputs under extracted/), then read those
	# outputs back into the response.
	# Validation problems are raised as HTTP 400. Any other failure is reported
	# in the response body with success=False rather than as an HTTP error.
	# NOTE(review): extract_all_text_advanced_pix2text is called synchronously
	# inside an async handler; if it is long-running it will hold up other
	# requests. Consider moving it to a worker thread once its thread-safety is
	# confirmed.
	@app.post("/extract", response_model=ExtractionResult, tags=["OCR Processing"])
	async def extract_text_from_pdf(
	    # No media_type override: the upload arrives as a multipart form part, and
	    # File()'s media_type describes the request body, not the file inside it.
	    file: UploadFile = File(..., description="PDF file to extract text from"),
	):
	    """
	    📄 Extract Text from PDF

	    Upload a PDF file and extract text using advanced multi-language OCR.

	    ## Process:
	    1. Upload: PDF is saved to `documents/` folder
	    2. Processing: Advanced OCR with language detection
	    3. Extraction: Text, JSON, and analysis files generated
	    4. Storage: Results saved to `extracted/` folder with naming convention

	    ## Naming Convention:
	    - Input: `document.pdf` → Output: `document_extract.txt`, `document_extract.json`

	    ## Returns:
	    - Complete extracted text
	    - Detailed extraction metadata
	    - Processing statistics and analysis
	    """

	    start_time = datetime.now()

	    try:
	        # Validate file type
	        if file.content_type != "application/pdf":
	            raise HTTPException(
	                status_code=status.HTTP_400_BAD_REQUEST,
	                detail=f"Invalid file type. Expected PDF, got {file.content_type}",
	            )

	        # A missing filename is rejected here as well, instead of failing
	        # later on `.lower()`.
	        if not file.filename or not file.filename.lower().endswith(".pdf"):
	            raise HTTPException(
	                status_code=status.HTTP_400_BAD_REQUEST,
	                detail="File must have .pdf extension",
	            )

	        # Generate safe filename and save uploaded file
	        safe_filename = get_safe_filename(file.filename)
	        documents_path = Path("documents") / safe_filename

	        # Stream the upload to disk rather than reading it into memory.
	        with open(documents_path, "wb") as buffer:
	            shutil.copyfileobj(file.file, buffer)

	        print(f"📁 File saved to: {documents_path}")

	        # Output locations follow the naming convention helper.
	        extracted_dir = Path("extracted")
	        text_path = extracted_dir / get_extraction_filename(safe_filename, "txt")
	        json_path = extracted_dir / get_extraction_filename(safe_filename, "json")
	        analysis_path = extracted_dir / get_extraction_filename(
	            safe_filename, "analysis"
	        )

	        print(f"🔄 Starting OCR processing for: {safe_filename}")

	        # Process the PDF using our advanced OCR system
	        extract_all_text_advanced_pix2text(
	            pdf_path=str(documents_path),
	            output_text_file=str(text_path),
	            output_json_file=str(json_path),
	            output_analysis_file=str(analysis_path),
	        )

	        # Read the extracted text
	        with open(text_path, "r", encoding="utf-8") as f:
	            extracted_text = f.read()

	        # Read the analysis for summary
	        with open(analysis_path, "r", encoding="utf-8") as f:
	            analysis_data = json.load(f)

	        processing_time = (datetime.now() - start_time).total_seconds()

	        print(f"✅ OCR processing completed in {processing_time:.2f} seconds")
	        print(f"📊 Extracted {len(extracted_text)} characters")

	        return ExtractionResult(
	            success=True,
	            filename=file.filename,
	            extracted_text=extracted_text,
	            text_file_path=str(text_path),
	            json_file_path=str(json_path),
	            analysis_file_path=str(analysis_path),
	            extraction_summary=analysis_data,
	            processing_time=processing_time,
	            error_message=None,
	        )

	    except HTTPException:
	        # Re-raise HTTP exceptions so validation failures keep their status.
	        raise
	    except Exception as e:
	        # Handle unexpected errors
	        error_message = f"OCR processing failed: {str(e)}"
	        print(f"❌ {error_message}")

	        processing_time = (datetime.now() - start_time).total_seconds()

	        return ExtractionResult(
	            success=False,
	            # The response model requires a string, so never pass None here.
	            filename=file.filename or "unknown",
	            extracted_text="",
	            text_file_path="",
	            json_file_path="",
	            analysis_file_path="",
	            extraction_summary={},
	            processing_time=processing_time,
	            error_message=error_message,
	        )


	@app.post("/eval", response_model=EvaluationResult, tags=["OCR Evaluation"])
	async def evaluate_ocr_extraction(
	    extracted_file: UploadFile = File(
	        ..., description="Text file containing OCR extracted text"
	    ),
	    baseline_file: UploadFile = File(
	        ..., description="Text file containing ground truth baseline text"
	    ),
	    evaluation_name: Optional[str] = None,
	):
	    """
	    📊 Evaluate OCR Extraction Accuracy

	    Compare extracted text file with ground truth baseline file to measure OCR accuracy.

	    ## Features:
	    - File-based input: Upload two text files for comparison
	    - Character-level accuracy: Precise character matching and edit distance
	    - Word-level accuracy: Word matching and error rates
	    - Line-level accuracy: Line comparison and similarity scores
	    - Language-specific metrics: Separate accuracy for English, Bangla, and Math
	    - Overall grading: Letter grade system (A+ to F)
	    - Detailed diff: Character-by-character comparison
	    - Recommendations: Suggestions for improving OCR accuracy

	    ## Input:
	    - `extracted_file`: Text file with OCR extracted content (.txt format)
	    - `baseline_file`: Text file with ground truth content (.txt format)
	    - `evaluation_name`: Optional name for the evaluation

	    ## Output:
	    - Comprehensive accuracy metrics
	    - Performance grading
	    - Detailed comparison analysis
	    - Improvement recommendations
	    """
	    started_at = datetime.now()

	    try:
	        print(f"🔍 Starting OCR evaluation: {evaluation_name or 'Unnamed'}")

	        # Both uploads must be .txt files; the extracted file is checked first.
	        for upload, complaint in (
	            (extracted_file, "Extracted file must be a .txt file"),
	            (baseline_file, "Baseline file must be a .txt file"),
	        ):
	            if not upload.filename.lower().endswith(".txt"):
	                raise HTTPException(
	                    status_code=status.HTTP_400_BAD_REQUEST,
	                    detail=complaint,
	                )

	        # Pull both payloads into memory and decode them. Undecodable bytes
	        # are replaced rather than rejected.
	        try:
	            extracted_bytes = await extracted_file.read()
	            baseline_bytes = await baseline_file.read()

	            extracted_text = extracted_bytes.decode("utf-8", errors="replace")
	            baseline_text = baseline_bytes.decode("utf-8", errors="replace")

	            print("📁 Files read successfully:")
	            print(
	                f" Extracted file: {extracted_file.filename} ({len(extracted_text)} characters)"
	            )
	            print(
	                f" Baseline file: {baseline_file.filename} ({len(baseline_text)} characters)"
	            )

	        except Exception as read_error:
	            raise HTTPException(
	                status_code=status.HTTP_400_BAD_REQUEST,
	                detail=f"Failed to read file contents: {str(read_error)}",
	            )

	        # There is nothing to compare against without a baseline.
	        if not baseline_text.strip():
	            raise HTTPException(
	                status_code=status.HTTP_400_BAD_REQUEST,
	                detail="Baseline file cannot be empty",
	            )

	        # Strip control characters from both sides before comparing.
	        from eval import clean_control_characters

	        extracted_text_clean = clean_control_characters(extracted_text)
	        baseline_text_clean = clean_control_characters(baseline_text)

	        print(
	            f"📝 Text lengths after cleaning - Extracted: {len(extracted_text_clean)}, Baseline: {len(baseline_text_clean)}"
	        )

	        evaluation_results = evaluate_ocr_accuracy(
	            extracted_text=extracted_text_clean,
	            baseline_text=baseline_text_clean,
	        )

	        # The evaluator signals a problem through an "error" key.
	        if "error" in evaluation_results:
	            raise HTTPException(
	                status_code=status.HTTP_400_BAD_REQUEST,
	                detail=evaluation_results["error"],
	            )

	        processing_time = (datetime.now() - started_at).total_seconds()

	        print(f"✅ Evaluation completed in {processing_time:.3f} seconds")
	        print(f"📊 Overall accuracy: {evaluation_results['overall_accuracy']:.2f}%")
	        print(f"🎯 Grade: {evaluation_results['evaluation_summary']['grade']}")

	        # These evaluator keys are copied into the response unchanged.
	        passthrough_keys = (
	            "overall_accuracy",
	            "similarity_score",
	            "character_metrics",
	            "word_metrics",
	            "line_metrics",
	            "language_specific",
	            "text_statistics",
	            "detailed_diff",
	            "evaluation_summary",
	        )
	        return EvaluationResult(
	            success=True,
	            evaluation_name=evaluation_name,
	            processing_time=processing_time,
	            error_message=None,
	            **{key: evaluation_results[key] for key in passthrough_keys},
	        )

	    except HTTPException:
	        # Validation failures keep their HTTP status.
	        raise
	    except Exception as unexpected:
	        # Anything else is reported in the body with success=False.
	        error_message = f"Evaluation failed: {str(unexpected)}"
	        print(f"❌ {error_message}")

	        return EvaluationResult(
	            success=False,
	            evaluation_name=evaluation_name,
	            overall_accuracy=0.0,
	            similarity_score=0.0,
	            character_metrics={},
	            word_metrics={},
	            line_metrics={},
	            language_specific={},
	            text_statistics={},
	            detailed_diff=[],
	            evaluation_summary={
	                "grade": "F (Error)",
	                "recommendations": [error_message],
	            },
	            processing_time=(datetime.now() - started_at).total_seconds(),
	            error_message=error_message,
	        )


	# Development server configuration
	if __name__ == "__main__":
	    # The hosting platform supplies PORT; fall back to 7860 when it is unset.
	    port = int(os.environ.get("PORT", 7860))

	    # Print the URLs for the port actually in use, so the banner stays correct
	    # whether or not PORT is set.
	    print("🚀 Starting Advanced Multi-Language OCR API Server...")
	    print(f"📖 API Documentation: http://localhost:{port}/docs")
	    print(f"🏠 Main Page: http://localhost:{port}/")
	    print(f"🏥 Health Check: http://localhost:{port}/health")

	    # Bind on all interfaces so the server is reachable from outside a container.
	    uvicorn.run("main:app", host="0.0.0.0", port=port, log_level="info")