| import json |
| import pandas as pd |
| import gradio as gr |
| from typing import Dict, Any |
| from web2json.preprocessor import BasicPreprocessor |
| from web2json.ai_extractor import AIExtractor, GeminiLLMClient |
| from web2json.postprocessor import PostProcessor |
| from web2json.pipeline import Pipeline |
| from pydantic import BaseModel, Field |
| import os |
| import dotenv |
|
|
# Populate the process environment from a .env file (if one is found) before
# webpage_to_json looks up GEMINI_API_KEY with os.getenv.
dotenv.load_dotenv()
|
|
| |
# Extraction schema for an article: title, author and main content, all
# required strings.
# NOTE: documented with comments instead of a class docstring on purpose;
# pydantic copies a model docstring into the generated JSON schema
# description, which would change the schema object handed to the pipeline.
class Article(BaseModel):
    title: str = Field(
        ...,
        description="The title of the article.",
    )
    author: str = Field(
        ...,
        description="The author of the article.",
    )
    content: str = Field(
        ...,
        description="The main content of the article.",
    )
|
|
# Extraction schema for a product: name and description (strings) plus a
# float price, all required.
# NOTE: documented with comments instead of a class docstring on purpose;
# pydantic copies a model docstring into the generated JSON schema
# description, which would change the schema object handed to the pipeline.
class Product(BaseModel):
    name: str = Field(
        ...,
        description="The name of the product.",
    )
    description: str = Field(
        ...,
        description="A detailed description of the product.",
    )
    price: float = Field(
        ...,
        description="The price of the product.",
    )
|
|
# Extraction schema for a job posting: title, company, location and
# description, all required strings.
# NOTE: documented with comments instead of a class docstring on purpose;
# pydantic copies a model docstring into the generated JSON schema
# description, which would change the schema object handed to the pipeline.
class JobPosting(BaseModel):
    title: str = Field(
        ...,
        description="The title of the job position.",
    )
    company: str = Field(
        ...,
        description="The name of the company offering the job.",
    )
    location: str = Field(
        ...,
        description="The location of the job.",
    )
    description: str = Field(
        ...,
        description="A detailed description of the job responsibilities.",
    )
|
|
# Schema names offered to the user, mapped to the pydantic model that each one
# selects. The keys feed the UI dropdown, so insertion order is display order.
SCHEMA_OPTIONS = {
    "Article": Article,
    "Product": Product,
    "Job Posting": JobPosting,
}
|
|
| |
|
|
# Prompt handed to AIExtractor. The {content} and {schema} placeholders are not
# filled in by this module; presumably the extractor substitutes them.
_PROMPT_TEMPLATE = """Extract the following information from the provided content according to the specified schema.

Content to analyze:
{content}

Schema requirements:
{schema}

Instructions:
- Extract only information that is explicitly present in the content
- Follow the exact structure and data types specified in the schema
- If a required field cannot be found, indicate this clearly
- Preserve the original formatting and context where relevant
- Return the extracted data in the format specified by the schema"""


def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, Any]:
    """Extract structured JSON from a web page or raw text using a predefined schema.

    Builds a preprocess / LLM-extract / postprocess pipeline and runs it on the
    input. Failures are reported as an ``{"error": ...}`` dictionary instead of
    being raised, so the UI and MCP clients always receive JSON.

    Args:
        content (str): A URL, or raw HTML/text, to extract information from.
        is_url (bool): True when content is a URL, False when it is raw HTML/text.
        schema_name (str): Schema to extract, one of "Article", "Product" or "Job Posting".

    Returns:
        dict: The pipeline result on success, otherwise a dict with a single "error" key.
    """
    # Reject blank input before any client is built; also tolerates None.
    if not content or not content.strip():
        return {"error": "No content provided. Enter a URL or paste raw HTML/text."}

    schema = SCHEMA_OPTIONS.get(schema_name)
    if schema is None:
        return {"error": f"Invalid schema name: {schema_name}. Choose from: {', '.join(SCHEMA_OPTIONS)}"}

    preprocessor = BasicPreprocessor(config={'keep_tags': False})

    # Client construction gets its own error message so a setup problem is
    # distinguishable from a failure while processing the content.
    try:
        llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {e}"}

    ai_extractor = AIExtractor(llm_client=llm, prompt_template=_PROMPT_TEMPLATE)
    postprocessor = PostProcessor()
    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

    # Boundary handler: any pipeline failure becomes an error payload rather
    # than an exception reaching the UI.
    try:
        result = pipeline.run(content, is_url, schema)
        print("-" * 80)
        print(f"Processed result: {result}")
        return result
    except Exception as e:
        return {"error": f"Processing error: {e}"}
|
|
| |
# Gradio front end for webpage_to_json: a text box for the URL or raw content,
# a checkbox saying which of the two it is, and a dropdown listing the
# SCHEMA_OPTIONS names. The returned dictionary is rendered as JSON.
demo = gr.Interface(
    fn=webpage_to_json,
    inputs=[
        gr.Textbox(
            label="Content (URL or Raw Text)",
            lines=10,
            placeholder="Enter URL or paste raw HTML/text here.",
        ),
        gr.Checkbox(
            label="Content is URL?",
            value=False,
        ),
        gr.Dropdown(
            choices=list(SCHEMA_OPTIONS),
            label="Select Schema",
            value="Article",
        ),
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas.",
)
|
|
if __name__ == "__main__":
    # Start the app only when run as a script; mcp_server=True asks Gradio to
    # expose the interface function as an MCP tool as well as serving the UI.
    demo.launch(mcp_server=True)
|
|