Spaces:

lenzcom
/

Email

Sleeping

App Files Files Community

Email / tutorial /02-composition /02-parsers /exercises /24-multi-parser-pipeline.js

lenzcom's picture

Upload folder using huggingface_hub

e706de2 verified 3 days ago

history blame contribute delete

16.9 kB

	/**
	* Exercise 24: Multi-Parser Content Pipeline
	*
	* Difficulty: ⭐⭐⭐⭐ (Expert)
	*
	* Goal: Build a robust content processing pipeline using multiple parsers with fallbacks
	*
	* In this exercise, you'll:
	* 1. Combine multiple parser types in one system
	* 2. Implement fallback strategies when parsing fails
	* 3. Use RegexOutputParser for custom extraction
	* 4. Build a production-ready error handling system
	* 5. Create a complete content analysis pipeline
	*
	* Skills practiced:
	* - Multi-parser orchestration
	* - Fallback parsing strategies
	* - Regex-based extraction
	* - Error handling and recovery
	* - Building robust production pipelines
	*/

	import {
	Runnable,
	PromptTemplate,
	StructuredOutputParser,
	ListOutputParser,
	RegexOutputParser
	} from '../../../../src/index.js';
	import {LlamaCppLLM} from '../../../../src/llm/llama-cpp-llm.js';
	import {QwenChatWrapper} from "node-llama-cpp";

	// Sample content to process.
	// Each entry pairs the raw `text` to analyse with a `type` label
	// ("news", "recipe" or "review"). ContentRouter looks parsers up by this
	// `type` value, so the labels must match the keys of its parser map.
	const CONTENT_SAMPLES = [
	{
	text: "Breaking: Stock market hits record high! NASDAQ up 2.5%, S&P 500 gains 1.8%. Tech sector leads with Apple +3.2%, Microsoft +2.9%. Analysts predict continued growth.",
	type: "news"
	},
	{
	text: "Recipe: Chocolate Chip Cookies. Ingredients: 2 cups flour, 1 cup butter, 1 cup sugar, 2 eggs, 1 tsp vanilla, 2 cups chocolate chips. Bake at 350°F for 12 minutes.",
	type: "recipe"
	},
	{
	text: "Product Review: The XPhone 15 Pro (Score: 8.5/10) - Great camera, long battery life, but expensive at $1,199. Pros: Display, Performance. Cons: Price, Weight.",
	type: "review"
	}
	];

	// ============================================================================
	// TODO 1: Create News Article Parser (Structured)
	// ============================================================================

	/**
	 * Extract structured data from news articles.
	 *
	 * Builds the chain `prompt -> LLM -> parser`. The parser and the prompt are
	 * exercise placeholders (TODO 1). Until both are filled in, the function
	 * rejects with a descriptive error instead of dereferencing `null`, and it
	 * does so before the LLM is constructed.
	 *
	 * @returns {Promise<*>} The chain produced by `prompt.pipe(llm).pipe(parser)`.
	 * @throws {Error} If `parser` or `prompt` has not been created yet.
	 */
	async function createNewsParser() {
	    // TODO: Create StructuredOutputParser
	    // Schema:
	    // - headline: string
	    // - category: string, enum: ["business", "technology", "politics", "sports", "other"]
	    // - sentiment: string, enum: ["positive", "negative", "neutral"]
	    // - entities: array (companies, people, places mentioned)
	    // - marketData: array (any numbers with context like "NASDAQ up 2.5%")
	    const parser = null;

	    // TODO: Create prompt
	    const prompt = null;

	    // Fail fast with an actionable message. Without this guard the
	    // unfinished exercise surfaces as "Cannot read properties of null".
	    if (parser == null || prompt == null) {
	        throw new Error(
	            'createNewsParser: complete TODO 1 (create the parser and the prompt) before building the chain'
	        );
	    }

	    const llm = new LlamaCppLLM({
	        modelPath: './models/your-model.gguf',
	        temperature: 0.1
	    });

	    return prompt.pipe(llm).pipe(parser);
	}

	// ============================================================================
	// TODO 2: Create Recipe Parser (Regex + List)
	// ============================================================================

	/**
	 * Build the recipe parser: a Runnable meant to combine regex-based and
	 * list-based extraction into a single recipe object.
	 *
	 * In its current skeleton form every field of the returned object is `null`.
	 *
	 * @returns {Promise<Runnable>} A Runnable whose `_call` yields
	 *   `{name, ingredients, temperature, time}`.
	 */
	async function createRecipeParser() {
	    // TODO: Create a Runnable that:
	    // 1. Extracts recipe name using RegexOutputParser
	    // 2. Extracts ingredients list using ListOutputParser
	    // 3. Extracts temperature and time using RegexOutputParser
	    // 4. Returns combined object
	    //
	    // Hint: You'll need to create multiple chains and combine their results

	    // RegexOutputParser for the recipe name.
	    // Pattern: /Recipe:\s*(.+?)\./
	    const nameParser = null;

	    // ListOutputParser for the ingredients.
	    const ingredientsParser = null;

	    // RegexOutputParser for temperature and time.
	    // Pattern: /(\d+)°F.*?(\d+)\s*minutes/
	    // (the lazy `.*?` is needed to skip text such as " for " between the two numbers)
	    const cookingParser = null;

	    // TODO: Create a custom Runnable that orchestrates all parsers
	    class RecipeParserRunnable extends Runnable {
	        async _call(input, config) {
	            const {text} = input;

	            // TODO: Extract name
	            // TODO: Extract ingredients
	            // TODO: Extract cooking details

	            // Combined result; each field stays null until the TODOs above are done.
	            const recipe = {
	                name: null,
	                ingredients: null,
	                temperature: null,
	                time: null
	            };
	            return recipe;
	        }
	    }

	    const recipeRunnable = new RecipeParserRunnable();
	    return recipeRunnable;
	}

	// ============================================================================
	// TODO 3: Create Review Parser with Fallback
	// ============================================================================

	/**
	 * Parse product reviews with a three-stage fallback strategy:
	 *   1. structured parsing through the LLM chain,
	 *   2. regex extraction on the raw text,
	 *   3. a basic result that just echoes the text with an error note.
	 *
	 * Each stage that fails logs a warning that includes the failure reason,
	 * so it stays visible why a fallback was taken.
	 *
	 * @returns {Promise<Runnable>} A Runnable whose `_call` resolves to
	 *   `{method: 'structured' | 'regex' | 'basic', data}` and never rejects
	 *   because of a parsing failure.
	 */
	async function createReviewParser() {
	    const llm = new LlamaCppLLM({
	        modelPath: './models/your-model.gguf',
	        temperature: 0.1
	    });

	    // TODO: Primary parser - StructuredOutputParser
	    // Schema: productName, score (number), pros (array), cons (array), price
	    const structuredParser = null;

	    // TODO: Fallback parser - RegexOutputParser
	    // Pattern to extract: Product name, Score, Price
	    // Example: /(\w+.*?)\s*\(Score:\s*([\d.]+).*?\$([\d,]+)/
	    const regexParser = null;

	    // TODO: Create a Runnable with fallback logic
	    class ReviewParserWithFallback extends Runnable {
	        async _call(input, config) {
	            const text = input.text;

	            // Stage 1: structured parser.
	            try {
	                // TODO: Try structured parser first
	                const prompt = new PromptTemplate({
	                    template: `Extract review data: {text}\n\n{format_instructions}`,
	                    inputVariables: ["text"],
	                    partialVariables: {
	                        format_instructions: structuredParser.getFormatInstructions()
	                    }
	                });

	                const chain = prompt.pipe(llm).pipe(structuredParser);
	                // Forward config so the inner chain receives the caller's run
	                // configuration, the same way ContentRouter forwards it.
	                const result = await chain.invoke({text}, config);

	                return {
	                    method: 'structured',
	                    data: result
	                };
	            } catch (error) {
	                console.warn(`Structured parsing failed, using regex fallback: ${error?.message ?? error}`);
	            }

	            // Stage 2: regex parser on the raw text.
	            try {
	                // TODO: Fall back to regex parser
	                const result = await regexParser.parse(text);

	                return {
	                    method: 'regex',
	                    data: result
	                };
	            } catch (regexError) {
	                console.warn(`Regex parsing failed, using basic extraction: ${regexError?.message ?? regexError}`);
	            }

	            // Stage 3: final fallback - return basic string parsing.
	            return {
	                method: 'basic',
	                data: {
	                    text: text,
	                    error: 'Could not parse structured data'
	                }
	            };
	        }
	    }

	    return new ReviewParserWithFallback();
	}

	// ============================================================================
	// TODO 4: Create Content Router
	// ============================================================================

	/**
	 * Route content to the appropriate parser based on its content type.
	 *
	 * Only the parser map's own properties are consulted, so a type such as
	 * "toString" or "constructor" is reported as unknown instead of resolving
	 * to a member inherited from Object.prototype.
	 */
	class ContentRouter extends Runnable {
	    /**
	     * @param {object} [parsers] - Map of content type to parser, e.g.
	     *   `{ news: parser, recipe: parser, review: parser }`. Each parser must
	     *   expose `invoke(input, config)`.
	     */
	    constructor(parsers = {}) {
	        super();
	        // A null map is treated as empty so lookups fail with the
	        // "No parser" error rather than a TypeError.
	        this.parsers = parsers ?? {};
	    }

	    /**
	     * @param {{text: string, type: string}} input - Content and its type label.
	     * @param {*} config - Passed through unchanged to the selected parser.
	     * @returns {Promise<{type: string, parsed: *, originalText: string}>}
	     * @throws {Error} If no parser is registered for `input.type`.
	     */
	    async _call(input, config) {
	        const {text, type} = input;

	        // Route to the parser registered for this type (own properties only).
	        const parser = Object.hasOwn(this.parsers, type) ? this.parsers[type] : undefined;

	        if (!parser) {
	            throw new Error(`No parser for content type: ${type}`);
	        }

	        // Parse content
	        const result = await parser.invoke({text}, config);

	        return {
	            type: type,
	            parsed: result,
	            originalText: text
	        };
	    }
	}

	// ============================================================================
	// TODO 5: Build Complete Pipeline with Error Handling
	// ============================================================================

	/**
	 * Run every entry of CONTENT_SAMPLES through the pipeline, log the outcome
	 * of each sample, and print a summary report.
	 *
	 * A failure on one sample is recorded and does not stop the remaining
	 * samples from being processed.
	 *
	 * @returns {Promise<{newsParser: *, recipeParser: *, reviewParser: *, router: *, results: Array<object>}>}
	 *   The created components plus one result record per sample:
	 *   `{success: true, data}` or `{success: false, error, sample}`.
	 */
	async function buildContentPipeline() {
	    console.log('=== Exercise 24: Multi-Parser Content Pipeline ===\n');

	    // TODO: Create all parsers
	    const newsParser = null;
	    const recipeParser = null;
	    const reviewParser = null;

	    // TODO: Create content router
	    const router = null; // new ContentRouter({ news: newsParser, recipe: recipeParser, review: reviewParser })

	    console.log('Processing content samples...\n');

	    const results = [];
	    const divider = '='.repeat(70);

	    // TODO: Process each content sample
	    for (const [index, sample] of CONTENT_SAMPLES.entries()) {
	        console.log(divider);
	        console.log(`SAMPLE ${index + 1}: ${sample.type.toUpperCase()}`);
	        console.log(divider);
	        console.log(`Text: ${sample.text}\n`);

	        try {
	            // TODO: Route and parse
	            const result = null;

	            console.log('Parsing Result:');
	            console.log(JSON.stringify(result, null, 2));

	            results.push({
	                success: true,
	                data: result
	            });
	        } catch (error) {
	            console.error(`Error: ${error.message}`);

	            results.push({
	                success: false,
	                error: error.message,
	                sample: sample
	            });
	        }

	        console.log();
	    }

	    // TODO: Generate summary report
	    console.log(divider);
	    console.log('PROCESSING SUMMARY');
	    console.log(divider);

	    const successful = results.filter(r => r.success).length;
	    const failed = results.length - successful;
	    // Guard the division so an empty sample list reports 0.0% instead of NaN%.
	    const successRate = results.length === 0 ? 0 : (successful / results.length) * 100;

	    console.log(`Total Samples: ${results.length}`);
	    console.log(`Successful: ${successful}`);
	    console.log(`Failed: ${failed}`);
	    console.log(`Success Rate: ${successRate.toFixed(1)}%`);

	    console.log('\n✓ Exercise 24 Complete!');

	    return {newsParser, recipeParser, reviewParser, router, results};
	}

	// Entry point: build the pipeline, hand its context to the automated tests,
	// and report any rejection from either step on stderr.
	buildContentPipeline()
	    .then((context) => runTests(context))
	    .catch((error) => console.error(error));

	// ============================================================================
	// AUTOMATED TESTS
	// ============================================================================

	/**
	 * Run the automated checks against the objects produced by
	 * buildContentPipeline() and print a pass/fail summary.
	 *
	 * Tests run sequentially and each one is awaited, so a rejected promise
	 * inside an async test is counted as a failure of that test.
	 *
	 * @param {{newsParser: *, recipeParser: *, reviewParser: *, router: *, results: Array<object>}} context
	 *   The value resolved by buildContentPipeline().
	 * @returns {Promise<{passed: number, failed: number}>} Final test counts.
	 */
	async function runTests(context) {
	    const {newsParser, recipeParser, reviewParser, router, results} = context;

	    console.log('\n' + '='.repeat(60));
	    console.log('RUNNING AUTOMATED TESTS');
	    console.log('='.repeat(60) + '\n');

	    const assert = (await import('assert')).default;
	    let passed = 0;
	    let failed = 0;

	    // The await matters: without it an async test function is reported as
	    // passed the moment it returns a promise, and a later rejection goes
	    // unhandled instead of being counted.
	    async function test(name, fn) {
	        try {
	            await fn();
	            passed++;
	            console.log(`✅ ${name}`);
	        } catch (error) {
	            failed++;
	            console.error(`❌ ${name}`);
	            console.error(` ${error.message}\n`);
	        }
	    }

	    // Test 1: All parsers created
	    await test('News parser created', () => {
	        assert(newsParser !== null, 'Create newsParser');
	        assert(newsParser instanceof Runnable, 'Should be Runnable');
	    });

	    await test('Recipe parser created', () => {
	        assert(recipeParser !== null, 'Create recipeParser');
	    });

	    await test('Review parser created', () => {
	        assert(reviewParser !== null, 'Create reviewParser');
	    });

	    await test('Router created', () => {
	        assert(router !== null, 'Create ContentRouter');
	        assert(router instanceof ContentRouter, 'Should be ContentRouter instance');
	    });

	    // Test 2: News parser validation
	    await test('News parser extracts structured data', async () => {
	        const result = await newsParser.invoke({
	            text: "Tech stocks surge: Apple up 5%, Google gains 3%"
	        });
	        assert(typeof result === 'object', 'Should return object');
	        assert('headline' in result || 'category' in result, 'Should have headline or category');
	    });

	    // Test 3: Recipe parser validation
	    await test('Recipe parser extracts components', async () => {
	        const result = await recipeParser.invoke({
	            text: "Recipe: Pasta. Ingredients: noodles, sauce. Bake at 400°F for 20 minutes."
	        });
	        assert(typeof result === 'object', 'Should return object');
	        assert('name' in result || 'ingredients' in result, 'Should have recipe components');
	    });

	    // Test 4: Review parser with fallback
	    await test('Review parser handles well-formed input', async () => {
	        const result = await reviewParser.invoke({
	            text: "Product XYZ (Score: 8/10) costs $99. Pros: Good. Cons: Expensive."
	        });
	        assert(result.method, 'Should indicate parsing method used');
	        assert(result.data, 'Should have data');
	    });

	    await test('Review parser falls back gracefully', async () => {
	        const result = await reviewParser.invoke({
	            text: "This is malformed data that won't parse well"
	        });
	        // Should not throw, should fall back
	        assert(result !== null, 'Should return something even on bad input');
	        assert(result.method, 'Should indicate which method was used');
	    });

	    // Test 5: Router functionality
	    await test('Router routes to correct parser', async () => {
	        const newsResult = await router.invoke({
	            text: "Breaking news story",
	            type: "news"
	        });
	        assert(newsResult.type === 'news', 'Should preserve content type');
	        assert(newsResult.parsed, 'Should have parsed data');
	    });

	    // Test 6: End-to-end pipeline
	    await test('Pipeline processed all samples', () => {
	        assert(results.length === CONTENT_SAMPLES.length, 'Should process all samples');
	    });

	    await test('Pipeline has reasonable success rate', () => {
	        const successRate = results.filter(r => r.success).length / results.length;
	        assert(successRate >= 0.5, 'Should successfully parse at least 50% of samples');
	    });

	    // Test 7: Error handling
	    await test('Pipeline handles invalid content type', async () => {
	        // A missing router must fail this test rather than count as
	        // "threw as expected".
	        assert(router, 'Create ContentRouter');
	        // assert.rejects fails when invoke() resolves. Wrapping the
	        // expectation in try/catch would swallow that failure.
	        await assert.rejects(
	            async () => router.invoke({
	                text: "Some text",
	                type: "invalid_type"
	            }),
	            'Should throw error for invalid type'
	        );
	    });

	    // Summary
	    console.log('\n' + '='.repeat(60));
	    console.log('TEST SUMMARY');
	    console.log('='.repeat(60));
	    console.log(`Total: ${passed + failed}`);
	    console.log(`✅ Passed: ${passed}`);
	    console.log(`❌ Failed: ${failed}`);
	    console.log('='.repeat(60));

	    if (failed === 0) {
	        console.log('\n🎉 All tests passed! You are a parser master!\n');
	        console.log('📚 What you mastered:');
	        console.log(' • Orchestrating multiple parser types');
	        console.log(' • Implementing fallback strategies');
	        console.log(' • Using RegexOutputParser for custom patterns');
	        console.log(' • Building robust error handling');
	        console.log(' • Creating production-ready pipelines');
	        console.log(' • Routing content to appropriate parsers');
	        console.log(' • Combining structured and pattern-based extraction\n');
	        console.log('🚀 You are ready for production parser systems!');
	    } else {
	        console.log('\n⚠️ Some tests failed. Review the advanced patterns.\n');
	    }

	    return {passed, failed};
	}

	/**
	* HINTS:
	*
	* 1. RegexOutputParser usage:
	* new RegexOutputParser({
	* regex: /Pattern: (.+), Value: (\d+)/,
	* outputKeys: ["pattern", "value"]
	* })
	*
	* 2. Fallback strategy:
	* try {
	* return await primaryParser.parse(text);
	* } catch (error) {
	* return await fallbackParser.parse(text);
	* }
	*
	* 3. Combining multiple parsers:
	* - Create separate chains for each
	* - Call them in sequence or parallel
	* - Combine results into single object
	*
	* 4. Custom Runnable for orchestration:
	* class MyParser extends Runnable {
	* async _call(input, config) {
	* const result1 = await parser1.parse(...);
	* const result2 = await parser2.parse(...);
	* return { result1, result2 };
	* }
	* }
	*
	* 5. Regex tips:
	* - Use () groups to capture data
	* - Test patterns at regex101.com
	* - Use \s for whitespace, \d for digits
	* - Make patterns flexible with .*?
	*
	* 6. Production patterns:
	* - Always have fallbacks
	* - Log which method succeeded
	* - Handle partial failures gracefully
	* - Return metadata about parsing method
	*/