Spaces:

Agents-MCP-Hackathon
/

MCP_Server_Web2JSON

Running

App Files Files Community

MCP_Server_Web2JSON / app.py

abdo-Mansour

improved the prompt

44fb3b3 11 months ago

raw

history blame

3.58 kB

	import json
	import pandas as pd
	import gradio as gr
	from typing import Dict, Any
	from web2json.preprocessor import BasicPreprocessor
	from web2json.ai_extractor import AIExtractor, GeminiLLMClient
	from web2json.postprocessor import PostProcessor
	from web2json.pipeline import Pipeline
	from pydantic import BaseModel, Field
	import os
	import dotenv

	dotenv.load_dotenv()

	# Define schemas
	class Article(BaseModel):
	    # Extraction target for article pages; every field is a required string.
	    # Documented with comments, not a docstring: pydantic would copy a class
	    # docstring into the model's generated JSON schema.
	    title: str = Field(
	        ...,
	        description="The title of the article.",
	    )
	    author: str = Field(
	        ...,
	        description="The author of the article.",
	    )
	    content: str = Field(
	        ...,
	        description="The main content of the article.",
	    )

	class Product(BaseModel):
	    # Extraction target for product pages; all three fields are required.
	    # Documented with comments, not a docstring: pydantic would copy a class
	    # docstring into the model's generated JSON schema.
	    name: str = Field(
	        ...,
	        description="The name of the product.",
	    )
	    description: str = Field(
	        ...,
	        description="A detailed description of the product.",
	    )
	    # The only non-string field among the schemas defined here.
	    price: float = Field(
	        ...,
	        description="The price of the product.",
	    )

	class JobPosting(BaseModel):
	    # Extraction target for job adverts; every field is a required string.
	    # Documented with comments, not a docstring: pydantic would copy a class
	    # docstring into the model's generated JSON schema.
	    title: str = Field(
	        ...,
	        description="The title of the job position.",
	    )
	    company: str = Field(
	        ...,
	        description="The name of the company offering the job.",
	    )
	    location: str = Field(
	        ...,
	        description="The location of the job.",
	    )
	    description: str = Field(
	        ...,
	        description="A detailed description of the job responsibilities.",
	    )

	# Registry of selectable schemas: user-facing label -> pydantic model class.
	# The labels double as the dropdown choices and as the accepted values of
	# webpage_to_json's ``schema_name`` argument.
	SCHEMA_OPTIONS: Dict[str, type] = {
	    "Article": Article,
	    "Product": Product,
	    "Job Posting": JobPosting,
	}

	# Core processing function

	def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, Any]:
	    """Extract structured JSON from a web page or raw text using a predefined schema.

	    An unknown schema name, empty content, a failure to create the LLM client
	    and any exception raised while running the pipeline are reported as a
	    dictionary with a single "error" key instead of being raised.

	    Args:
	        content: A URL to process, or the raw HTML/text to extract from.
	        is_url: True when ``content`` is a URL rather than raw HTML/text.
	        schema_name: Key of SCHEMA_OPTIONS selecting the target schema
	            ("Article", "Product" or "Job Posting").

	    Returns:
	        Whatever the pipeline returns for the given content and schema, or
	        ``{"error": "<message>"}`` when one of the failures above occurs.
	    """
	    if schema_name not in SCHEMA_OPTIONS:
	        return {"error": f"Invalid schema name: {schema_name}. Choose from: {', '.join(SCHEMA_OPTIONS.keys())}"}

	    # Reject empty / whitespace-only input up front rather than sending
	    # nothing through the pipeline and the LLM.
	    if not content or not content.strip():
	        return {"error": "No content provided. Enter a URL or paste raw HTML/text."}

	    if is_url:
	        # URLs pasted into a multi-line field often carry stray whitespace or
	        # newlines; raw text is deliberately left untouched.
	        content = content.strip()

	    schema = SCHEMA_OPTIONS[schema_name]
	    prompt_template = """Extract the following information from the provided content according to the specified schema.

	Content to analyze:
	{content}

	Schema requirements:
	{schema}

	Instructions:
	- Extract only information that is explicitly present in the content
	- Follow the exact structure and data types specified in the schema
	- If a required field cannot be found, indicate this clearly
	- Preserve the original formatting and context where relevant
	- Return the extracted data in the format specified by the schema"""

	    # Initialize pipeline components
	    preprocessor = BasicPreprocessor(config={'keep_tags': False})
	    try:
	        llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
	    except Exception as exc:
	        # Boundary handler: surface the problem to the caller as data.
	        return {"error": f"Failed to initialize LLM client: {exc}"}

	    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
	    postprocessor = PostProcessor()
	    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

	    try:
	        result = pipeline.run(content, is_url, schema)
	        print("-"*80)
	        print(f"Processed result: {result}")
	        return result
	    except Exception as exc:
	        # Boundary handler: any pipeline failure becomes an error payload.
	        return {"error": f"Processing error: {exc}"}

	# Build Gradio Interface
	demo = gr.Interface(
	    title="Webpage to JSON Converter",
	    description="Convert web pages or raw text into structured JSON using customizable schemas.",
	    fn=webpage_to_json,
	    # Listed in the same order as webpage_to_json's parameters:
	    # content, is_url, schema_name.
	    inputs=[
	        gr.Textbox(
	            label="Content (URL or Raw Text)",
	            lines=10,
	            placeholder="Enter URL or paste raw HTML/text here.",
	        ),
	        gr.Checkbox(
	            label="Content is URL?",
	            value=False,
	        ),
	        gr.Dropdown(
	            choices=list(SCHEMA_OPTIONS),
	            label="Select Schema",
	            value="Article",
	        ),
	    ],
	    outputs=gr.JSON(label="Output JSON"),
	)

	# Start the app only when this file is executed directly; the launch call
	# passes mcp_server=True.
	if __name__ == "__main__":
	    demo.launch(mcp_server=True)