Spaces:
Running
Running
| import httpx | |
| import os | |
| from typing import Dict, Any, Optional | |
| from pathlib import Path | |
class AssetAnalyzer:
    """Service to analyze uploaded assets using an OCR API and extract content.

    Configuration is read from the environment when an instance is created:

    * ``OCR_API_URL`` -- base URL of the OCR service. The built-in default is
      used when the variable is unset or empty; a trailing slash is removed.
    * ``OCR_API_KEY`` -- optional key, sent as the ``X-API-Key`` header when
      it is non-empty.
    """

    _DEFAULT_OCR_API_URL = "https://seth0330-ezofisocr.hf.space"

    # Extension groups used by ``_get_file_type``. Only "document" and
    # "image" files are sent to the OCR API.
    _DOCUMENT_EXTENSIONS = frozenset({'.pdf', '.doc', '.docx', '.txt', '.rtf'})
    _IMAGE_EXTENSIONS = frozenset(
        {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg'}
    )
    _VIDEO_EXTENSIONS = frozenset({'.mp4', '.avi', '.mov', '.wmv', '.flv'})

    # MIME type per extension. Every document/image extension above has an
    # entry so an accepted upload is never labelled as a generic byte stream.
    _CONTENT_TYPES = {
        '.pdf': 'application/pdf',
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.txt': 'text/plain',
        '.rtf': 'application/rtf',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.webp': 'image/webp',
        '.svg': 'image/svg+xml',
    }

    # Keys that are never rendered as "key: value" insights ('raw_text' is
    # rendered separately as the document content).
    _NON_INSIGHT_KEYS = frozenset({'raw_text', 'confidence', 'metadata'})

    _MAX_LIST_ITEMS = 5          # list values are cut to this many items
    _MAX_RAW_TEXT_CHARS = 500    # raw text is cut to this many characters

    def __init__(self):
        # `or` (rather than a getenv default) also covers a variable that is
        # set but empty; rstrip avoids "//api/extract" when the configured
        # URL ends with a slash.
        self.ocr_api_url = (
            os.getenv("OCR_API_URL") or self._DEFAULT_OCR_API_URL
        ).rstrip("/")
        self.ocr_api_key = os.getenv("OCR_API_KEY", "")

    async def analyze_document(self, file_path: str, key_fields: Optional[str] = None) -> Dict[str, Any]:
        """
        Analyze a document using the OCR API

        Args:
            file_path: Path to the file to analyze
            key_fields: Optional comma-separated string of key fields to extract

        Returns:
            Dictionary with a boolean "success" flag and "extracted_content"
            (the decoded JSON response on success, otherwise None), plus
            either a "message" or an "error" string. Files that are neither
            documents nor images are reported as successful with no content.
            No exception is propagated; failures are reported in "error".
        """
        try:
            file_path_obj = Path(file_path)
            if not file_path_obj.exists():
                return {
                    "success": False,
                    "error": "File not found",
                    "extracted_content": None
                }

            # Only documents and images are sent to the OCR API
            file_type = self._get_file_type(file_path)
            if file_type not in ("document", "image"):
                return {
                    "success": True,
                    "extracted_content": None,
                    "message": f"File type {file_type} not suitable for OCR analysis"
                }

            data = {}
            if key_fields:
                data['key_fields'] = key_fields

            headers = {}
            if self.ocr_api_key:
                headers["X-API-Key"] = self.ocr_api_key

            # The request is made inside the `with` block: the open file
            # handle is passed to the client and must stay open until the
            # upload has been sent.
            with open(file_path, 'rb') as f:
                files = {'file': (file_path_obj.name, f, self._get_content_type(file_path))}
                async with httpx.AsyncClient(timeout=60.0) as client:
                    response = await client.post(
                        f"{self.ocr_api_url}/api/extract",
                        headers=headers,
                        files=files,
                        data=data
                    )

            if response.status_code == 200:
                return {
                    "success": True,
                    "extracted_content": response.json(),
                    "message": "Document analyzed successfully"
                }
            return {
                "success": False,
                "error": f"OCR API returned status {response.status_code}: {response.text}",
                "extracted_content": None
            }
        except Exception as e:
            # Deliberate catch-all: this method always answers with a result
            # dictionary. Exceptions raised without a message stringify to
            # "", so fall back to the exception type name.
            return {
                "success": False,
                "error": str(e) or type(e).__name__,
                "extracted_content": None
            }

    async def analyze_image(self, file_path: str) -> Dict[str, Any]:
        """
        Analyze an image using GPT-4 Vision (for screenshots, infographics, etc.)
        This is a placeholder for future implementation

        Args:
            file_path: Path to the image file (currently unused)

        Returns:
            Dictionary containing a fixed placeholder image analysis
        """
        # TODO: Implement GPT-4 Vision analysis for images
        # For now, return a placeholder
        return {
            "success": True,
            "extracted_content": {
                "type": "image",
                "message": "Image analysis not yet implemented"
            },
            "message": "Image analysis placeholder"
        }

    def _get_file_type(self, file_path: str) -> str:
        """Determine file type ("document", "image", "video" or "unknown") from the extension"""
        ext = Path(file_path).suffix.lower()
        if ext in self._DOCUMENT_EXTENSIONS:
            return "document"
        if ext in self._IMAGE_EXTENSIONS:
            return "image"
        if ext in self._VIDEO_EXTENSIONS:
            return "video"
        return "unknown"

    def _get_content_type(self, file_path: str) -> str:
        """Get MIME type for file; 'application/octet-stream' for unlisted extensions"""
        ext = Path(file_path).suffix.lower()
        return self._CONTENT_TYPES.get(ext, 'application/octet-stream')

    def extract_key_insights(self, extracted_content: Dict[str, Any]) -> str:
        """
        Extract key insights from OCR results to use as context for AI content generation

        Args:
            extracted_content: The JSON response from OCR API

        Returns:
            Formatted string with one insight per line: "key: value" for
            string/number fields, the first five items for list fields, and
            the raw text (cut to 500 characters) as "Document content: ...".
            Returns "" for empty input and a fixed fallback sentence when
            nothing could be extracted.
        """
        if not extracted_content:
            return ""

        insights = []
        if isinstance(extracted_content, dict):
            for key, value in extracted_content.items():
                if key in self._NON_INSIGHT_KEYS:
                    continue
                if isinstance(value, str):
                    if value:
                        insights.append(f"{key}: {value}")
                elif isinstance(value, (int, float)):
                    # Numbers are kept even when falsy: a zero amount or a
                    # False flag is information, not a missing value.
                    insights.append(f"{key}: {value}")
                elif isinstance(value, list) and value:
                    shown = ', '.join(map(str, value[:self._MAX_LIST_ITEMS]))
                    insights.append(f"{key}: {shown}")

            raw_text = extracted_content.get('raw_text')
            if isinstance(raw_text, str) and raw_text:
                if len(raw_text) > self._MAX_RAW_TEXT_CHARS:
                    # Long text is truncated, not summarized
                    insights.append(
                        f"Document content: {raw_text[:self._MAX_RAW_TEXT_CHARS]}..."
                    )
                else:
                    insights.append(f"Document content: {raw_text}")

        return "\n".join(insights) if insights else "No specific insights extracted"