import { extractToolData, normalizeContentToString } from '../utils';

/**
 * Normalised view of a web-scrape tool invocation and its result.
 */
export interface WebScrapeData {
  /** First URL that was requested, or `null` when none could be found. */
  url: string | null;
  /** Every URL that was requested, or `null` when none could be found. */
  urls: string[] | null;
  /** Success flag as reported by the payload; `undefined` when not reported. */
  success?: boolean;
  /** Raw tool output (or a summary), or `null` when unavailable. */
  message: string | null;
  /** Entries matching `- <name>.json` found in the tool output. */
  files: string[];
  /** Number captured from the "Successfully scraped N URLs" line, else 0. */
  urlCount: number;
  /** Timestamp taken from the payload's execution details, when present. */
  timestamp?: string;
}

/** Matches the success line of the tool output and captures the URL count. */
const SCRAPED_COUNT_PATTERN = /Successfully scraped (?:all )?(\d+) URLs?/;

/**
 * Matches each `- something.json` entry in the tool output.
 * Only used with `String.prototype.match`, which resets `lastIndex` for
 * global patterns, so sharing one instance is safe.
 */
const JSON_FILE_ENTRY_PATTERN = /- ([^\n]+\.json)/g;

/** Builds the "nothing extracted" result used by the new-format extractor. */
const emptyScrapeData = (): WebScrapeData => ({
  url: null,
  urls: null,
  success: undefined,
  message: null,
  files: [],
  urlCount: 0,
  timestamp: undefined,
});

/**
 * Pulls the scraped-URL count and the list of `.json` entries out of a tool
 * output string.
 *
 * @param output - Plain-text tool output.
 * @returns `urlCount` (0 when the success line is absent) and `files`
 *   (empty when no `- *.json` entries are present).
 */
const parseScrapeOutput = (output: string): { files: string[]; urlCount: number } => {
  const countMatch = output.match(SCRAPED_COUNT_PATTERN);
  const urlCount = countMatch ? parseInt(countMatch[1], 10) : 0;

  const fileEntries = output.match(JSON_FILE_ENTRY_PATTERN);
  const files = fileEntries
    ? fileEntries.map((entry: string) => entry.replace('- ', ''))
    : [];

  return { files, urlCount };
};

/**
 * Returns `content` JSON-parsed when it is a string holding valid JSON;
 * otherwise returns `content` untouched.
 */
const parseContent = (content: any): any => {
  if (typeof content !== 'string') {
    return content;
  }
  try {
    return JSON.parse(content);
  } catch {
    // Not JSON: callers treat the raw string as "no structured data".
    return content;
  }
};

/**
 * Extracts scrape data from the structured ("new") payload format, i.e. an
 * object carrying a `tool_execution` entry, optionally wrapped inside a
 * `{ role, content }` message envelope.
 *
 * @param content - Raw payload; may be an object or a JSON string.
 * @returns The extracted data, or an all-empty result when the payload does
 *   not match the structured format.
 */
const extractFromNewFormat = (content: any): WebScrapeData => {
  const parsedContent = parseContent(content);

  if (!parsedContent || typeof parsedContent !== 'object') {
    return emptyScrapeData();
  }

  const toolExecution = parsedContent.tool_execution;
  // `typeof null === 'object'`, so null must be excluded explicitly; otherwise
  // a payload of `{ tool_execution: null }` would throw on property access.
  if (
    'tool_execution' in parsedContent &&
    typeof toolExecution === 'object' &&
    toolExecution !== null
  ) {
    const args = toolExecution.arguments || {};

    // `urls` may arrive as a comma-separated string or as an array.
    let urls: string[] | null = null;
    if (args.urls) {
      if (typeof args.urls === 'string') {
        urls = args.urls.split(',').map((u: string) => u.trim());
      } else if (Array.isArray(args.urls)) {
        urls = args.urls;
      }
    }
    const url: string | null = urls?.[0] || null;

    let files: string[] = [];
    let urlCount = 0;
    let message = '';

    // Counts and file names are only derivable from plain-text output.
    const rawOutput = toolExecution.result?.output;
    if (typeof rawOutput === 'string') {
      message = rawOutput;
      ({ files, urlCount } = parseScrapeOutput(rawOutput));
    }

    const extractedData = {
      url,
      urls,
      success: toolExecution.result?.success,
      message: message || parsedContent.summary || null,
      files,
      urlCount,
      timestamp: toolExecution.execution_details?.timestamp,
    };

    console.log('WebScrapeToolView: Extracted from new format:', {
      url: extractedData.url,
      urlCount: extractedData.urlCount,
      fileCount: extractedData.files.length,
      success: extractedData.success,
    });

    return extractedData;
  }

  // Message envelope: unwrap and retry on the inner content.
  if ('role' in parsedContent && 'content' in parsedContent) {
    return extractFromNewFormat(parsedContent.content);
  }

  return emptyScrapeData();
};

/**
 * Finds the scrape target in legacy (string-based) content.
 *
 * Tries a `urls="..."` / `urls='...'` attribute first and falls back to the
 * first `http(s)://` substring.
 *
 * @returns The matched text, or `null` when nothing matches.
 */
const extractScrapeUrl = (content: string | object | undefined | null): string | null => {
  const contentStr = normalizeContentToString(content);
  if (!contentStr) return null;

  // NOTE(review): the pattern starts with `]*` (zero or more `]`), which looks
  // like the remnant of a longer tag-matching prefix — confirm the intended
  // pattern. As written it matches any whitespace-preceded `urls=` attribute.
  const attributeMatch = contentStr.match(/]*\s+urls=["']([^"']+)["']/);
  if (attributeMatch) {
    return attributeMatch[1];
  }

  const httpMatch = contentStr.match(/https?:\/\/[^\s<>"]+/);
  return httpMatch ? httpMatch[0] : null;
};

/**
 * Parses legacy (string-based) tool output.
 *
 * When the content embeds `output='...'`, only that portion is used and its
 * literal `\n` sequences are turned into real newlines.
 *
 * @returns Success flag (true when the text contains "Successfully scraped"),
 *   the cleaned message, the `.json` entries and the scraped-URL count.
 */
const extractScrapeResults = (
  content: string | object | undefined | null,
): {
  success: boolean;
  message: string;
  files: string[];
  urlCount: number;
} => {
  const contentStr = normalizeContentToString(content);
  if (!contentStr) {
    return { success: false, message: 'No output received', files: [], urlCount: 0 };
  }

  const outputMatch = contentStr.match(/output='([^']+)'/);
  const cleanContent = outputMatch ? outputMatch[1].replace(/\\n/g, '\n') : contentStr;

  const { files, urlCount } = parseScrapeOutput(cleanContent);
  const success = cleanContent.includes('Successfully scraped');

  return { success, message: cleanContent, files, urlCount };
};

/**
 * Extracts scrape data from legacy content.
 *
 * Uses `extractToolData` when it yields both a tool result and arguments
 * (in which case only the URL is taken from it); otherwise falls back to
 * regex-based parsing of the stringified content.
 */
const extractFromLegacyFormat = (content: any): WebScrapeData => {
  const toolData = extractToolData(content);

  if (toolData.toolResult && toolData.arguments) {
    console.log('WebScrapeToolView: Extracted from legacy format (extractToolData):', {
      url: toolData.url,
    });

    return {
      url: toolData.url || null,
      urls: toolData.url ? [toolData.url] : null,
      success: undefined,
      message: null,
      files: [],
      urlCount: 0,
    };
  }

  const contentStr = normalizeContentToString(content);
  if (!contentStr) {
    return { url: null, urls: null, success: undefined, message: null, files: [], urlCount: 0 };
  }

  const url = extractScrapeUrl(contentStr);
  const results = extractScrapeResults(contentStr);

  console.log('WebScrapeToolView: Extracted from legacy format (manual parsing):', {
    url,
    fileCount: results.files.length,
    urlCount: results.urlCount,
  });

  return {
    url,
    urls: url ? [url] : null,
    success: results.success,
    message: results.message,
    files: results.files,
    urlCount: results.urlCount,
  };
};

/**
 * Extracts web-scrape data from an assistant message and/or a tool message.
 *
 * Resolution order:
 * 1. structured data found in `assistantContent`;
 * 2. structured data found in `toolContent`;
 * 3. legacy parsing of both, preferring assistant values field by field.
 *
 * A structured source counts as "found" when it yields a URL, at least one
 * file, or a non-zero URL count.
 *
 * @param assistantContent - Assistant message payload (object or string).
 * @param toolContent - Tool message payload (object or string).
 * @param isSuccess - Caller-supplied success flag, used unless the chosen
 *   structured payload reports its own.
 * @param toolTimestamp - Caller-supplied tool timestamp, overridden when the
 *   tool's structured payload carries one.
 * @param assistantTimestamp - Caller-supplied assistant timestamp, overridden
 *   when the assistant's structured payload carries one.
 * @returns The extracted fields plus the resolved success flag and timestamps.
 */
export function extractWebScrapeData(
  assistantContent: any,
  toolContent: any,
  isSuccess: boolean,
  toolTimestamp?: string,
  assistantTimestamp?: string
): {
  url: string | null;
  urls: string[] | null;
  success: boolean;
  message: string | null;
  files: string[];
  urlCount: number;
  actualIsSuccess: boolean;
  actualToolTimestamp?: string;
  actualAssistantTimestamp?: string;
} {
  let url: string | null = null;
  let urls: string[] | null = null;
  let success = false;
  let message: string | null = null;
  let files: string[] = [];
  let urlCount = 0;
  let actualIsSuccess = isSuccess;
  let actualToolTimestamp = toolTimestamp;
  let actualAssistantTimestamp = assistantTimestamp;

  const assistantNewFormat = extractFromNewFormat(assistantContent);
  const toolNewFormat = extractFromNewFormat(toolContent);

  console.log('WebScrapeToolView: Format detection results:', {
    assistantNewFormat: {
      hasUrl: !!assistantNewFormat.url,
      fileCount: assistantNewFormat.files.length,
      urlCount: assistantNewFormat.urlCount,
    },
    toolNewFormat: {
      hasUrl: !!toolNewFormat.url,
      fileCount: toolNewFormat.files.length,
      urlCount: toolNewFormat.urlCount,
    },
  });

  if (
    assistantNewFormat.url ||
    assistantNewFormat.files.length > 0 ||
    assistantNewFormat.urlCount > 0
  ) {
    url = assistantNewFormat.url;
    urls = assistantNewFormat.urls;
    success = assistantNewFormat.success || false;
    message = assistantNewFormat.message;
    files = assistantNewFormat.files;
    urlCount = assistantNewFormat.urlCount;
    if (assistantNewFormat.success !== undefined) {
      actualIsSuccess = assistantNewFormat.success;
    }
    if (assistantNewFormat.timestamp) {
      actualAssistantTimestamp = assistantNewFormat.timestamp;
    }
    console.log('WebScrapeToolView: Using assistant new format data');
  } else if (
    toolNewFormat.url ||
    toolNewFormat.files.length > 0 ||
    toolNewFormat.urlCount > 0
  ) {
    url = toolNewFormat.url;
    urls = toolNewFormat.urls;
    success = toolNewFormat.success || false;
    message = toolNewFormat.message;
    files = toolNewFormat.files;
    urlCount = toolNewFormat.urlCount;
    if (toolNewFormat.success !== undefined) {
      actualIsSuccess = toolNewFormat.success;
    }
    if (toolNewFormat.timestamp) {
      actualToolTimestamp = toolNewFormat.timestamp;
    }
    console.log('WebScrapeToolView: Using tool new format data');
  } else {
    // Neither payload is structured: merge whatever legacy parsing finds,
    // giving the assistant message precedence for each field.
    const assistantLegacy = extractFromLegacyFormat(assistantContent);
    const toolLegacy = extractFromLegacyFormat(toolContent);

    url = assistantLegacy.url || toolLegacy.url;
    urls = assistantLegacy.urls || toolLegacy.urls;
    success = assistantLegacy.success || toolLegacy.success || false;
    message = assistantLegacy.message || toolLegacy.message;
    files = assistantLegacy.files.length > 0 ? assistantLegacy.files : toolLegacy.files;
    urlCount = assistantLegacy.urlCount > 0 ? assistantLegacy.urlCount : toolLegacy.urlCount;

    console.log('WebScrapeToolView: Using legacy format data:', {
      url,
      fileCount: files.length,
      urlCount,
    });
  }

  console.log('WebScrapeToolView: Final extracted data:', {
    url,
    fileCount: files.length,
    urlCount,
    actualIsSuccess,
  });

  return {
    url,
    urls,
    success,
    message,
    files,
    urlCount,
    actualIsSuccess,
    actualToolTimestamp,
    actualAssistantTimestamp,
  };
}