Spaces:

Kraft102
/

widgettdc-api

Paused

App Files Files Community

widgettdc-api / apps /backend /src /services /ingestion /VisualAssetExtractor.ts

Kraft102

Update backend source

34367da verified 2 months ago

raw

history blame contribute delete

6.37 kB

	import * as cheerio from 'cheerio';
	import { logger } from '../../utils/logger.js';
	import { getNeo4jVectorStore } from '../../platform/vector/Neo4jVectorStoreAdapter.js';
	import { getLlmService } from '../llm/llmService.js';

	/**
	 * A single visual element (image, figure, chart or diagram) discovered in an
	 * ingested document, together with the text used to describe and retrieve it.
	 */
	export interface VisualAsset {
	  /** Identifier of the asset; also used as the record id when the asset is stored. */
	  id: string;
	  /** Kind of visual. The HTML extractor in this file only emits 'image' and 'figure'. */
	  type: 'image' | 'figure' | 'chart' | 'diagram';
	  /** Absolute URL the asset was found at. */
	  sourceUrl: string;
	  /** Optional local file path. NOTE(review): not set by any code in this file — confirm who populates it. */
	  localPath?: string;
	  /** Human-readable description (alt text, caption, or LLM-generated text). */
	  description: string;
	  /** Free-form labels attached to the asset. */
	  tags: string[];
	  /** Arbitrary extra data about the asset (e.g. the page URL it came from); values are untyped. */
	  metadata: Record<string, any>;
	}

	/**
	 * Collects visual assets (images and figures) from ingested documents, optionally
	 * enriches their description/tags through the LLM service, and persists them in the
	 * vector store under the `assets` namespace.
	 */
	export class VisualAssetExtractor {
	  /**
	   * Extract visual assets from HTML content.
	   *
	   * Two passes are made over the document: every `<img>` whose `src` does not look
	   * like an icon/logo/tracking pixel becomes an `image` asset, and every `<figure>`
	   * containing an `<img>` becomes a `figure` asset described by its caption.
	   *
	   * NOTE(review): an `<img>` nested inside a `<figure>` is reported by both passes
	   * (once as `image`, once as `figure`). This mirrors the existing behavior; confirm
	   * whether callers rely on it before de-duplicating.
	   *
	   * @param html - Raw HTML markup to scan.
	   * @param baseUrl - URL of the page; used to resolve relative `src` values and
	   *   recorded as `metadata.pageUrl`.
	   * @returns Image assets first, then figure assets, each in document order.
	   */
	  async extractFromHtml(html: string, baseUrl: string): Promise<VisualAsset[]> {
	    const $ = cheerio.load(html);
	    const assets: VisualAsset[] = [];

	    // Find images
	    $('img').each((_, el) => {
	      const src = $(el).attr('src');
	      const alt = $(el).attr('alt') ?? '';

	      // Filter out small icons/tracking pixels based on keywords for now.
	      // In production, check image dimensions.
	      if (!src || this.looksDecorative(src)) {
	        return;
	      }

	      const absoluteUrl = this.resolveUrl(src, baseUrl);
	      if (absoluteUrl === null) {
	        return; // Invalid URL: skip this element, keep scanning the rest.
	      }

	      assets.push({
	        id: this.createAssetId('img'),
	        type: 'image',
	        sourceUrl: absoluteUrl,
	        description: alt,
	        tags: ['web-image'], // Initial tag
	        metadata: {
	          originalAlt: alt,
	          pageUrl: baseUrl,
	        },
	      });
	    });

	    // Find figures (often charts/diagrams)
	    $('figure').each((_, el) => {
	      const img = $(el).find('img').first();
	      const caption = $(el).find('figcaption').text().trim();
	      const src = img.attr('src');
	      if (!src) {
	        return;
	      }

	      const absoluteUrl = this.resolveUrl(src, baseUrl);
	      if (absoluteUrl === null) {
	        return; // Invalid URL: skip this figure.
	      }

	      assets.push({
	        id: this.createAssetId('fig'),
	        type: 'figure',
	        sourceUrl: absoluteUrl,
	        // `||` (not `??`) on purpose: an empty caption/alt must fall through.
	        description: caption || img.attr('alt') || 'Figure',
	        tags: ['figure', 'chart'],
	        metadata: {
	          caption,
	          pageUrl: baseUrl,
	        },
	      });
	    });

	    return assets;
	  }

	  /**
	   * Placeholder for PDF extraction.
	   *
	   * @param filePath - Path of the PDF; currently only logged.
	   * @returns Always an empty array until extraction is implemented.
	   */
	  async extractFromPdf(filePath: string): Promise<VisualAsset[]> {
	    // TODO: Implement PDF image extraction using 'pdf-lib' or similar
	    logger.info(`Would extract images from PDF: ${filePath}`);
	    return [];
	  }

	  /**
	   * Placeholder for PPTX extraction.
	   *
	   * @param filePath - Path of the PPTX; currently only logged.
	   * @returns Always an empty array until extraction is implemented.
	   */
	  async extractFromPptx(filePath: string): Promise<VisualAsset[]> {
	    // TODO: Implement PPTX image extraction using 'jszip' (unzipping pptx structure)
	    logger.info(`Would extract images from PPTX: ${filePath}`);
	    return [];
	  }

	  /**
	   * Analyze and label assets using the LLM service.
	   *
	   * Assets are processed one at a time and are mutated in place: the description is
	   * replaced and new tags are merged (without duplicates) when the model returns
	   * usable JSON. Assets whose description is already longer than 50 characters are
	   * skipped. A failure on one asset is logged and does not stop the others.
	   *
	   * @param assets - Assets to enrich.
	   * @returns The same array instance that was passed in.
	   */
	  async analyzeAndLabel(assets: VisualAsset[]): Promise<VisualAsset[]> {
	    const llm = getLlmService();

	    for (const asset of assets) {
	      try {
	        // Skip if description is already detailed
	        if (asset.description.length > 50) continue;

	        // In a real scenario, we would pass the image URL or binary to a Vision model.
	        // For now, we simulate based on existing metadata/alt text.
	        // Built with join() so source indentation never leaks into the prompt.
	        const prompt = [
	          'Analyze this image metadata and generate 3 relevant tags and a short description.',
	          `Type: ${asset.type}`,
	          `Source: ${asset.sourceUrl}`,
	          `Current Description: ${asset.description}`,
	          `Metadata: ${JSON.stringify(asset.metadata)}`,
	        ].join('\n');

	        const analysis = await llm.generateContextualResponse(
	          "You are an expert Image Analyst. Output JSON with { description: string, tags: string[] }.",
	          prompt
	        );

	        const parsed = this.parseAnalysis(analysis);
	        if (parsed === null) {
	          // Best effort: keep the asset as it was, but leave a trace for debugging.
	          logger.warn(`Could not parse analysis for asset ${asset.id}; keeping existing description and tags`);
	          continue;
	        }

	        asset.description = parsed.description ?? asset.description;
	        asset.tags = [...new Set([...asset.tags, ...parsed.tags])];
	      } catch (error) {
	        logger.warn(`Failed to analyze asset ${asset.id}`, error);
	      }
	    }

	    return assets;
	  }

	  /**
	   * Store assets in the vector store for retrieval.
	   *
	   * Each asset becomes one record whose content combines description, tags and type;
	   * the asset's own metadata is copied and extended with `type`, `sourceUrl` and
	   * `assetType`. Nothing is sent when `assets` is empty.
	   *
	   * @param assets - Assets to upsert.
	   */
	  async storeAssets(assets: VisualAsset[]): Promise<void> {
	    if (assets.length === 0) {
	      return; // Nothing to persist; avoid a pointless round trip to the store.
	    }

	    const namespace = 'assets'; // Separate namespace for visuals
	    const vectorStore = getNeo4jVectorStore();

	    await vectorStore.batchUpsert({
	      records: assets.map(asset => ({
	        id: asset.id,
	        content: `${asset.description}\nTags: ${asset.tags.join(', ')}\nType: ${asset.type}`,
	        metadata: {
	          ...asset.metadata,
	          type: 'visual_asset',
	          sourceUrl: asset.sourceUrl,
	          assetType: asset.type,
	        },
	        namespace,
	      })),
	      namespace,
	    });

	    logger.info(`🖼️ Stored ${assets.length} visual assets`);
	  }

	  /** True when the `src` contains a keyword typical of icons, logos or tracking pixels. */
	  private looksDecorative(src: string): boolean {
	    return ['icon', 'logo', 'pixel'].some(keyword => src.includes(keyword));
	  }

	  /** Resolves `src` against `baseUrl`; returns null when the pair does not form a valid URL. */
	  private resolveUrl(src: string, baseUrl: string): string | null {
	    try {
	      return new URL(src, baseUrl).toString();
	    } catch {
	      return null;
	    }
	  }

	  /** Builds an id of the form `<prefix>-<epoch ms>-<up to 5 random base-36 chars>`. */
	  private createAssetId(prefix: string): string {
	    // slice(2, 7) is the non-deprecated equivalent of substr(2, 5).
	    return `${prefix}-${Date.now()}-${Math.random().toString(36).slice(2, 7)}`;
	  }

	  /**
	   * Pulls the first `{...}` span out of a (possibly messy) LLM reply and validates it.
	   * Returns null when no JSON object can be parsed; otherwise returns only the fields
	   * that have the expected type (non-empty string description, string-only tags).
	   */
	  private parseAnalysis(raw: string): { description?: string; tags: string[] } | null {
	    const jsonMatch = raw.match(/\{[\s\S]*\}/);
	    if (!jsonMatch) {
	      return null;
	    }

	    let parsed: unknown;
	    try {
	      parsed = JSON.parse(jsonMatch[0]);
	    } catch {
	      return null;
	    }
	    if (typeof parsed !== 'object' || parsed === null) {
	      return null;
	    }

	    const { description, tags } = parsed as { description?: unknown; tags?: unknown };
	    return {
	      description: typeof description === 'string' && description.length > 0 ? description : undefined,
	      tags: Array.isArray(tags) ? tags.filter((tag): tag is string => typeof tag === 'string') : [],
	    };
	  }
	}

	/** Module-level `VisualAssetExtractor` instance, created when this module is first loaded. */
	export const visualAssetExtractor = new VisualAssetExtractor();