| import httpx |
| import os |
| from typing import Dict, Any, Optional |
| from pathlib import Path |
|
|
class AssetAnalyzer:
    """Service to analyze uploaded assets using an external OCR API.

    Configuration is read from the environment at construction time:
      * ``OCR_API_URL`` - base URL of the OCR service (defaults to the
        public Hugging Face space endpoint).
      * ``OCR_API_KEY`` - optional key sent as the ``X-API-Key`` header.
    """

    # Extension categories used to decide how an asset is handled.
    # NOTE: every entry in the document/image sets must also appear in
    # _CONTENT_TYPES below so uploads carry an accurate MIME type.
    _DOCUMENT_EXTENSIONS = frozenset({'.pdf', '.doc', '.docx', '.txt', '.rtf'})
    _IMAGE_EXTENSIONS = frozenset({'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg'})
    _VIDEO_EXTENSIONS = frozenset({'.mp4', '.avi', '.mov', '.wmv', '.flv'})

    # MIME types for every supported upload extension. Previously .rtf,
    # .bmp, .webp and .svg were OCR-eligible per _get_file_type but fell
    # back to application/octet-stream here; the map now covers them.
    _CONTENT_TYPES = {
        '.pdf': 'application/pdf',
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.txt': 'text/plain',
        '.rtf': 'application/rtf',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.webp': 'image/webp',
        '.svg': 'image/svg+xml',
    }

    def __init__(self):
        # Missing env vars fall back to the public demo endpoint and
        # unauthenticated requests (empty key -> no X-API-Key header).
        self.ocr_api_url = os.getenv("OCR_API_URL", "https://seth0330-ezofisocr.hf.space")
        self.ocr_api_key = os.getenv("OCR_API_KEY", "")

    async def analyze_document(self, file_path: str, key_fields: Optional[str] = None) -> Dict[str, Any]:
        """
        Analyze a document using the OCR API.

        Args:
            file_path: Path to the file to analyze
            key_fields: Optional comma-separated string of key fields to extract

        Returns:
            Dictionary with keys ``success``, ``extracted_content`` and
            either ``message`` (on success / skip) or ``error`` (on failure).
            Never raises: all exceptions are folded into the error dict.
        """
        try:
            file_path_obj = Path(file_path)
            if not file_path_obj.exists():
                return {
                    "success": False,
                    "error": "File not found",
                    "extracted_content": None
                }

            # Videos and unknown types are skipped, not treated as errors.
            file_type = self._get_file_type(file_path)
            if file_type not in ("document", "image"):
                return {
                    "success": True,
                    "extracted_content": None,
                    "message": f"File type {file_type} not suitable for OCR analysis"
                }

            data: Dict[str, str] = {}
            if key_fields:
                data['key_fields'] = key_fields

            headers: Dict[str, str] = {}
            if self.ocr_api_key:
                headers["X-API-Key"] = self.ocr_api_key

            # The handle must remain open while httpx streams the upload,
            # so the HTTP request is made inside the `with open(...)` block.
            with open(file_path, 'rb') as f:
                files = {'file': (file_path_obj.name, f, self._get_content_type(file_path))}
                async with httpx.AsyncClient(timeout=60.0) as client:
                    response = await client.post(
                        f"{self.ocr_api_url}/api/extract",
                        headers=headers,
                        files=files,
                        data=data
                    )

            if response.status_code == 200:
                result = response.json()
                return {
                    "success": True,
                    "extracted_content": result,
                    "message": "Document analyzed successfully"
                }
            return {
                "success": False,
                "error": f"OCR API returned status {response.status_code}: {response.text}",
                "extracted_content": None
            }
        except Exception as e:
            # Deliberate catch-all: callers receive a uniform error dict
            # instead of an exception (network failures, bad JSON, IO errors).
            return {
                "success": False,
                "error": str(e),
                "extracted_content": None
            }

    async def analyze_image(self, file_path: str) -> Dict[str, Any]:
        """
        Analyze an image using GPT-4 Vision (for screenshots, infographics, etc.)
        This is a placeholder for future implementation.

        Args:
            file_path: Path to the image file

        Returns:
            Dictionary containing image analysis (currently a stub payload).
        """
        return {
            "success": True,
            "extracted_content": {
                "type": "image",
                "message": "Image analysis not yet implemented"
            },
            "message": "Image analysis placeholder"
        }

    def _get_file_type(self, file_path: str) -> str:
        """Classify a file by extension as document / image / video / unknown."""
        ext = Path(file_path).suffix.lower()
        if ext in self._DOCUMENT_EXTENSIONS:
            return "document"
        if ext in self._IMAGE_EXTENSIONS:
            return "image"
        if ext in self._VIDEO_EXTENSIONS:
            return "video"
        return "unknown"

    def _get_content_type(self, file_path: str) -> str:
        """Return the MIME type for a file, defaulting to octet-stream."""
        ext = Path(file_path).suffix.lower()
        return self._CONTENT_TYPES.get(ext, 'application/octet-stream')

    def extract_key_insights(self, extracted_content: Dict[str, Any]) -> str:
        """
        Extract key insights from OCR results to use as context for AI content generation.

        Args:
            extracted_content: The JSON response from OCR API

        Returns:
            Formatted string with key insights, one ``key: value`` line per
            field, with ``raw_text`` summarized last (truncated to 500 chars).
        """
        if not extracted_content:
            return ""

        insights = []

        if isinstance(extracted_content, dict):
            # Structured fields first; bookkeeping keys are skipped and
            # raw_text is handled separately below.
            for key, value in extracted_content.items():
                if value and key not in ('raw_text', 'confidence', 'metadata'):
                    if isinstance(value, (str, int, float)):
                        insights.append(f"{key}: {value}")
                    elif isinstance(value, list) and len(value) > 0:
                        # Cap list previews at 5 items to keep context short.
                        insights.append(f"{key}: {', '.join(map(str, value[:5]))}")

            if 'raw_text' in extracted_content:
                raw_text = extracted_content['raw_text']
                if isinstance(raw_text, str) and len(raw_text) > 0:
                    if len(raw_text) > 500:
                        insights.append(f"Document content: {raw_text[:500]}...")
                    else:
                        insights.append(f"Document content: {raw_text}")

        return "\n".join(insights) if insights else "No specific insights extracted"
|
|
|
|