// Kraft102's picture
// Update backend source
// 34367da verified
import * as cheerio from 'cheerio';
import { logger } from '../../utils/logger.js';
import { getNeo4jVectorStore } from '../../platform/vector/Neo4jVectorStoreAdapter.js';
import { getLlmService } from '../llm/llmService.js';
/**
 * A single visual element (image, figure, ...) discovered in a source document,
 * together with the text used to describe and retrieve it.
 */
export interface VisualAsset {
/** Unique identifier; the HTML extractor in this file generates `img-…` / `fig-…` ids from a timestamp plus a random suffix. */
id: string;
/** Asset category. The HTML extractor in this file only assigns 'image' and 'figure'. */
type: 'image' | 'figure' | 'chart' | 'diagram';
/** Location of the asset; the HTML extractor stores the absolute URL resolved against the page URL. */
sourceUrl: string;
/** Optional local file path. NOTE(review): nothing in this file sets it — presumably filled in by a download step elsewhere; confirm. */
localPath?: string;
/** Human-readable description: initially the alt text or figure caption, possibly replaced by the LLM analysis step. */
description: string;
/** Labels for the asset; starts with extractor defaults and is extended (de-duplicated) by the LLM analysis step. */
tags: string[];
/** Free-form extra data (e.g. `pageUrl`, `originalAlt`, `caption`); spread into the vector-store record metadata when stored. */
metadata: Record<string, any>;
}
/**
 * Extracts visual assets (images, figures) from documents, enriches them with
 * LLM-generated descriptions/tags and persists them in the vector store.
 */
export class VisualAssetExtractor {
  /** Substrings of an `<img src>` that mark it as decorative or tracking noise (case-sensitive match). */
  private static readonly NOISE_KEYWORDS: readonly string[] = ['icon', 'logo', 'pixel'];

  /** Descriptions longer than this are considered detailed enough to skip LLM analysis. */
  private static readonly DETAILED_DESCRIPTION_LENGTH = 50;

  /** Greedy match from the first `{` to the last `}` — pulls a JSON object out of a chatty LLM reply. */
  private static readonly JSON_OBJECT_PATTERN = /\{[\s\S]*\}/;

  /**
   * Extract visual assets from HTML content.
   *
   * Every `<img>` whose `src` does not look like an icon/logo/tracking pixel becomes an
   * `image` asset, and every `<figure>` with an image becomes a `figure` asset described
   * by its caption. The lead image of a `<figure>` is reported only once (as the figure),
   * not additionally as a plain image.
   *
   * @param html    Raw HTML markup to scan.
   * @param baseUrl URL of the page; relative `src` values are resolved against it.
   * @returns The discovered assets. Elements whose URL cannot be resolved are skipped.
   */
  async extractFromHtml(html: string, baseUrl: string): Promise<VisualAsset[]> {
    const $ = cheerio.load(html);
    const assets: VisualAsset[] = [];
    let unresolved = 0;

    // Plain images. Noise is filtered by keyword only for now;
    // in production, check the image dimensions instead.
    $('img').each((_, el) => {
      const img = $(el);
      const src = img.attr('src');
      if (!src || VisualAssetExtractor.NOISE_KEYWORDS.some((keyword) => src.includes(keyword))) {
        return;
      }

      // The first image of a <figure> is emitted by the figure pass below;
      // emitting it here as well would store the same picture twice.
      const figure = img.closest('figure');
      if (figure.length > 0 && figure.find('img').first().is(el)) {
        return;
      }

      const absoluteUrl = this.resolveUrl(src, baseUrl);
      if (absoluteUrl === undefined) {
        unresolved += 1;
        return;
      }

      const alt = img.attr('alt') || '';
      assets.push({
        id: this.createId('img'),
        type: 'image',
        sourceUrl: absoluteUrl,
        description: alt,
        tags: ['web-image'], // Initial tag
        metadata: {
          originalAlt: alt,
          pageUrl: baseUrl
        }
      });
    });

    // Figures (often charts/diagrams): represented by their first image and caption.
    $('figure').each((_, el) => {
      const img = $(el).find('img').first();
      const src = img.attr('src');
      if (!src) {
        return;
      }

      const absoluteUrl = this.resolveUrl(src, baseUrl);
      if (absoluteUrl === undefined) {
        unresolved += 1;
        return;
      }

      const caption = $(el).find('figcaption').text().trim();
      assets.push({
        id: this.createId('fig'),
        type: 'figure',
        sourceUrl: absoluteUrl,
        description: caption || img.attr('alt') || 'Figure',
        tags: ['figure', 'chart'],
        metadata: {
          caption,
          pageUrl: baseUrl
        }
      });
    });

    if (unresolved > 0) {
      // One summary line instead of one warning per element keeps logs readable
      // when the base URL itself is malformed (every element fails in that case).
      logger.warn(`Skipped ${unresolved} visual asset(s) with unresolvable URLs (base: ${baseUrl})`);
    }

    return assets;
  }

  /**
   * Placeholder for PDF extraction.
   *
   * @param filePath Path of the PDF file.
   * @returns Always an empty list until extraction is implemented.
   */
  async extractFromPdf(filePath: string): Promise<VisualAsset[]> {
    // TODO: Implement PDF image extraction using 'pdf-lib' or similar
    logger.info(`Would extract images from PDF: ${filePath}`);
    return [];
  }

  /**
   * Placeholder for PPTX extraction.
   *
   * @param filePath Path of the PPTX file.
   * @returns Always an empty list until extraction is implemented.
   */
  async extractFromPptx(filePath: string): Promise<VisualAsset[]> {
    // TODO: Implement PPTX image extraction using 'jszip' (unzipping pptx structure)
    logger.info(`Would extract images from PPTX: ${filePath}`);
    return [];
  }

  /**
   * Analyze and label assets using the LLM service.
   *
   * Assets are processed one after another and mutated in place: the description may be
   * replaced and new tags are merged into the existing ones. Assets whose description is
   * already longer than 50 characters are left untouched. A failure for one asset is
   * logged and does not stop the remaining assets from being processed.
   *
   * @param assets Assets to enrich.
   * @returns The same array instance that was passed in.
   */
  async analyzeAndLabel(assets: VisualAsset[]): Promise<VisualAsset[]> {
    if (assets.length === 0) {
      return assets; // Nothing to do — avoid acquiring the LLM service at all.
    }

    const llm = getLlmService();

    for (const asset of assets) {
      try {
        // Skip if description is already detailed
        if (asset.description.length > VisualAssetExtractor.DETAILED_DESCRIPTION_LENGTH) continue;

        // In a real scenario, we would pass the image URL or binary to a Vision model.
        // For now, we simulate based on existing metadata/alt text.
        const prompt = [
          'Analyze this image metadata and generate 3 relevant tags and a short description.',
          `Type: ${asset.type}`,
          `Source: ${asset.sourceUrl}`,
          `Current Description: ${asset.description}`,
          `Metadata: ${JSON.stringify(asset.metadata)}`
        ].join('\n');

        const analysis = await llm.generateContextualResponse(
          'You are an expert Image Analyst. Output JSON with { description: string, tags: string[] }.',
          prompt
        );

        this.applyAnalysis(asset, analysis);
      } catch (error) {
        logger.warn(`Failed to analyze asset ${asset.id}`, error);
      }
    }

    return assets;
  }

  /**
   * Store assets in the vector DB for retrieval.
   *
   * Each asset becomes one record in the `assets` namespace whose content combines
   * description, tags and type. An empty list is a no-op.
   *
   * @param assets Assets to persist.
   */
  async storeAssets(assets: VisualAsset[]): Promise<void> {
    if (assets.length === 0) {
      logger.info('No visual assets to store');
      return;
    }

    const vectorStore = getNeo4jVectorStore();

    await vectorStore.batchUpsert({
      records: assets.map((asset) => ({
        id: asset.id,
        content: `${asset.description}\nTags: ${asset.tags.join(', ')}\nType: ${asset.type}`,
        metadata: {
          // Spread first so the fixed keys below always win over caller-supplied metadata.
          ...asset.metadata,
          type: 'visual_asset',
          sourceUrl: asset.sourceUrl,
          assetType: asset.type
        },
        namespace: 'assets' // Separate namespace for visuals
      })),
      namespace: 'assets'
    });

    logger.info(`🖼️ Stored ${assets.length} visual assets`);
  }

  /** Resolves `src` against `baseUrl`; returns `undefined` when either part is not a valid URL. */
  private resolveUrl(src: string, baseUrl: string): string | undefined {
    try {
      return new URL(src, baseUrl).toString();
    } catch {
      return undefined;
    }
  }

  /** Builds an asset id of the form `<prefix>-<epoch millis>-<up to 5 random base-36 chars>`. */
  private createId(prefix: 'img' | 'fig'): string {
    return `${prefix}-${Date.now()}-${Math.random().toString(36).slice(2, 7)}`;
  }

  /**
   * Merges an LLM reply into `asset`, accepting only well-typed fields: a non-blank string
   * description and string tags. Anything else in the reply is ignored so a malformed
   * answer can never corrupt the asset.
   */
  private applyAnalysis(asset: VisualAsset, analysis: unknown): void {
    if (typeof analysis !== 'string') {
      logger.warn(`Ignoring non-text analysis for asset ${asset.id}`);
      return;
    }

    // LLM output might be messy: take the JSON object embedded in the reply, if any.
    const jsonMatch = analysis.match(VisualAssetExtractor.JSON_OBJECT_PATTERN);
    if (!jsonMatch) {
      logger.warn(`No JSON object found in analysis for asset ${asset.id}`);
      return;
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(jsonMatch[0]);
    } catch (error) {
      logger.warn(`Could not parse analysis JSON for asset ${asset.id}`, error);
      return;
    }

    if (typeof parsed !== 'object' || parsed === null) {
      return;
    }
    const { description, tags } = parsed as { description?: unknown; tags?: unknown };

    if (typeof description === 'string' && description.trim().length > 0) {
      asset.description = description.trim();
    }

    if (Array.isArray(tags)) {
      const cleanTags = tags
        .filter((tag): tag is string => typeof tag === 'string')
        .map((tag) => tag.trim())
        .filter((tag) => tag.length > 0);
      asset.tags = [...new Set([...asset.tags, ...cleanTags])];
    }
  }
}
// Ready-made module-level instance for callers that do not need their own;
// the class itself is also exported, so separate instances can still be created.
export const visualAssetExtractor = new VisualAssetExtractor();