Spaces:
Running
on
Zero
Running
on
Zero
| import gradio as gr | |
| import json | |
| import torch | |
| import os | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| import spaces | |
# Markdown heading rendered at the top of the page; also used for the
# gr.Blocks window title in create_demo().
title = """ # 🙋🏻♂️Welcome to 🌟Tonic's 🌊 Osmosis Structure - Text to JSON Converter
"""

# Markdown shown in the first column of the info row: what the demo does and
# a short fact sheet about the model.
description = """
Convert unstructured text into well-formatted JSON using the Osmosis Structure 0.6B model.
This model is specifically trained for structured data extraction and format conversion.
### ℹ️ About Osmosis Structure
- **Model**: Osmosis Structure 0.6B parameters
- **Architecture**: Qwen3 (specialized for structured data)
- **Purpose**: Converting unstructured text to structured JSON format
- **Optimizations**: Fine-tuned for data extraction and format conversion tasks
The model automatically identifies key information in your text and organizes it into logical JSON structures.
"""

# Markdown shown in the second column of the info row: community links.
joinus = """
## Join us :
🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [MultiTonic](https://github.com/MultiTonic)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""

# Model configuration: repository id handed to both `from_pretrained` calls
# in load_model().
MODEL_NAME = "osmosis-ai/Osmosis-Structure-0.6B"

# Global variables to store the model and tokenizer. They stay None until
# load_model() assigns them; text_to_json() refuses to run while either is
# still None.
model = None
tokenizer = None
def load_model():
    """Populate the module-level ``model`` and ``tokenizer`` from Hugging Face.

    The access token is read from the ``HF_KEY`` environment variable; when
    it is missing or empty the download is still attempted anonymously.
    Progress and any failure are reported on stdout, and for errors that
    look like authentication (401) or missing-repository (404) problems a
    short troubleshooting hint is printed as well.

    Returns:
        bool: True when both the tokenizer and the model were loaded,
        False if any exception was raised along the way.
    """
    global model, tokenizer

    try:
        print("Loading Osmosis Structure model...")

        # An unset or empty HF_KEY is normalised to None (anonymous access).
        hf_token = os.environ.get("HF_KEY") or None
        if hf_token is None:
            print("⚠️ Warning: HF_KEY not found in environment variables")
            print("Attempting to load without token...")
        else:
            print("✅ HF token found, accessing gated repository...")

        # Arguments shared by the tokenizer and the model download.
        hub_kwargs = {"trust_remote_code": True, "token": hf_token}

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **hub_kwargs)

        print("Loading model...")
        # Half precision and automatic device placement only when CUDA is
        # available; otherwise full precision with default placement.
        use_cuda = torch.cuda.is_available()
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="auto" if use_cuda else None,
            **hub_kwargs,
        )

        print("✅ Osmosis Structure model loaded successfully!")
        return True
    except Exception as e:
        print(f"❌ Error loading model: {e}")

        # Pick a troubleshooting hint by looking for tell-tale fragments in
        # the exception text.
        details = str(e)
        details_lower = details.lower()
        if "401" in details or "authentication" in details_lower:
            hints = (
                "💡 This appears to be an authentication error.",
                "Please ensure:",
                "1. HF_KEY is set correctly in your Space secrets",
                "2. Your token has access to the gated repository",
                "3. You have accepted the model's license agreement",
            )
        elif "404" in details or "not found" in details_lower:
            hints = (
                "💡 Model repository not found.",
                "Please check if the model name is correct and accessible",
            )
        else:
            hints = ()
        for hint in hints:
            print(hint)
        return False
def _strip_wrappers(text):
    """Remove a leading code fence or label and a trailing code fence from *text*."""
    cleaned = text
    for prefix in ("```json", "```", "Here's the JSON:", "JSON:"):
        if cleaned.startswith(prefix):
            cleaned = cleaned[len(prefix):].strip()
    if cleaned.endswith("```"):
        cleaned = cleaned[:-len("```")].strip()
    return cleaned


def _json_candidates(text):
    """List substrings of *text* that may hold a complete JSON document.

    The span from the first ``{`` to the last ``}`` and the span from the
    first ``[`` to the last ``]`` are both considered, ordered by where they
    start so that the outermost container is tried first. The whole text with
    fences and labels removed is always the final candidate.
    """
    spans = []
    for opener, closer in (("{", "}"), ("[", "]")):
        start = text.find(opener)
        end = text.rfind(closer)
        if start != -1 and end > start:
            spans.append((start, text[start:end + 1]))
    spans.sort(key=lambda span: span[0])
    candidates = [snippet for _, snippet in spans]
    candidates.append(_strip_wrappers(text))
    return candidates


def text_to_json(input_text, schema_text, max_tokens=512, temperature=0.6, top_p=0.95, top_k=20):
    """Convert plain text to structured JSON using the Osmosis Structure model.

    A chat prompt is built from a system instruction (which embeds
    ``schema_text`` when one is supplied) plus the user's text, the
    module-level ``model`` samples a completion, and the first parsable JSON
    document found in that completion is returned pretty-printed.

    Args:
        input_text (str): The unstructured text to convert to JSON.
        schema_text (str): Optional JSON schema describing the desired
            output; blank or None selects a generic instruction.
        max_tokens (int, optional): Maximum number of new tokens to
            generate. Defaults to 512.
        temperature (float, optional): Sampling temperature. Defaults to 0.6.
        top_p (float, optional): Nucleus sampling parameter. Defaults to 0.95.
        top_k (int, optional): Number of highest-probability tokens kept
            when sampling. Defaults to 20.

    Returns:
        str: Indented JSON on success. Otherwise a human-readable message:
        one starting with "❌" when the model is not loaded, the input is
        blank or generation failed, or the raw model reply prefixed with a
        "may need manual cleanup" note when no valid JSON could be found.
        This function does not raise.

    Example:
        With the schema
        ``{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}}}``
        and the text "The conference will be held on June 10-12, 2024 at the
        Grand Hotel." a typical result is ``{"event_start_date": "2024-06-10"}``.
    """
    # NOTE(review): this file imports `spaces`, but no `@spaces.GPU` decorator
    # is applied to this function - confirm whether the deployment target
    # (ZeroGPU) needs one for GPU inference.
    if model is None or tokenizer is None:
        return "❌ Model not loaded. Please check the console for loading errors."
    if not input_text or not input_text.strip():
        # Nothing to convert; skip an expensive and meaningless generation.
        return "❌ Please enter some text to convert."

    try:
        if schema_text and schema_text.strip():
            system_prompt = f"You are a helpful assistant that understands and translates text to JSON format according to the following schema. {schema_text}"
        else:
            system_prompt = "You are a helpful assistant that converts unstructured text into well-formatted JSON. Extract key information and organize it into a logical, structured format. Always respond with valid JSON."

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Convert this text to JSON format:\n\n{input_text}"},
        ]
        formatted_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        inputs = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=2048,
        )
        # Put the prompt tensors on whatever device the model lives on.
        inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

        # Some tokenizers define no pad token; fall back to EOS for padding.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                # UI sliders and API clients may deliver these counts as
                # floats; generation expects integers.
                max_new_tokens=int(max_tokens),
                top_k=int(top_k),
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
            )

        # The returned sequence starts with the prompt; keep only what the
        # model appended after it.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_text = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True,
        ).strip()

        candidates = _json_candidates(generated_text)
        for candidate in candidates:
            try:
                parsed_json = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            return json.dumps(parsed_json, indent=2, ensure_ascii=False)

        # Nothing parsed: hand back the most promising fragment for manual repair.
        return f"Generated response (may need manual cleanup):\n\n{candidates[0]}"
    except Exception as e:
        return f"❌ Error generating JSON: {str(e)}"
def create_demo():
    """Create and configure the Gradio demo interface.

    The page consists of a header, an info row (model description and
    community links), a working row (input text, optional JSON schema,
    generation settings and the convert button on one side, the generated
    JSON on the other) and a set of clickable examples. Both the convert
    button and the input box's submit event call ``text_to_json``.

    Returns:
        gr.Blocks: A configured Gradio interface ready to be launched.

    Example:
        >>> demo = create_demo()
        >>> demo.launch()
    """
    # `title` is a Markdown heading ("# ..." plus surrounding whitespace).
    # The browser-tab title is plain text, so drop the heading markup there
    # and keep the Markdown version for the on-page header.
    page_title = title.strip().lstrip("#").strip()

    # Schema shared by the two conference/workshop examples below.
    event_schema = '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}, "event_end_date": {"type": "string", "format": "date"}, "location": {"type": "string"}, "registration_fees": {"type": "object", "properties": {"early_bird_price": {"type": "number"}, "regular_price": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}, "contact_email": {"type": "string"}}}'

    with gr.Blocks(
        title=page_title,
        theme=gr.themes.Monochrome(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        """
    ) as demo:
        # Header section
        gr.Markdown(title)

        # Info section
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(description)
            with gr.Column(scale=1):
                gr.Markdown(joinus)

        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="📝 Input Text",
                    placeholder="Enter your unstructured text here...\n\nExample: 'The conference will be held on June 10-12, 2024 at the Grand Hotel in San Francisco. Registration fee is $500 for early bird (before May 1) and $650 for regular registration. Contact info@conference.com for questions.'",
                    lines=8,
                    max_lines=15
                )
                schema_text = gr.Textbox(
                    label="📋 JSON Schema (Optional)",
                    placeholder="Enter your JSON schema here...\n\nExample: {\"type\": \"object\", \"properties\": {\"event_start_date\": {\"type\": \"string\", \"format\": \"date\"}, \"event_end_date\": {\"type\": \"string\", \"format\": \"date\"}, \"location\": {\"type\": \"string\"}, \"registration_fees\": {\"type\": \"object\", \"properties\": {\"early_bird_price\": {\"type\": \"number\"}, \"regular_price\": {\"type\": \"number\"}, \"early_bird_deadline\": {\"type\": \"string\", \"format\": \"date\"}}}, \"contact_email\": {\"type\": \"string\"}}}",
                    lines=8,
                    max_lines=15
                )
                with gr.Accordion("⚙️ Generation Settings", open=False):
                    max_tokens = gr.Slider(
                        minimum=50,
                        maximum=1000,
                        value=512,
                        step=10,
                        label="Max Tokens",
                        info="Maximum number of tokens to generate"
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.6,
                        step=0.1,
                        label="Temperature",
                        info="Controls randomness (lower = more focused)"
                    )
                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                        label="Top-p",
                        info="Nucleus sampling parameter"
                    )
                    top_k = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=20,
                        step=1,
                        label="Top-k",
                        info="Limits vocabulary for generation"
                    )
                convert_btn = gr.Button(
                    "🔄 Convert to JSON",
                    variant="primary",
                    size="lg"
                )
            with gr.Column(scale=1):
                output_json = gr.Textbox(
                    label="📋 Generated JSON",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    show_copy_button=True
                )

        # Examples section: each entry is [input text, JSON schema].
        gr.Examples(
            examples=[
                [
                    "The conference will be held on June 10-12, 2024 at the Grand Hotel in San Francisco. Registration fee is $500 for early bird (before May 1) and $650 for regular registration. Contact info@conference.com for questions.",
                    event_schema
                ],
                [
                    "The workshop is scheduled for March 15-16, 2024 at Tech Hub in Seattle. Early bird tickets cost $299 until February 15, after which regular tickets will be $399. For inquiries, email workshop@techhub.com",
                    event_schema
                ],
                [
                    "Product: Wireless Headphones Model XYZ-100. Price: $199.99. Features: Bluetooth 5.0, 30-hour battery, noise cancellation, wireless charging case. Colors available: Black, White, Blue. Warranty: 2 years. Rating: 4.5/5 stars (324 reviews).",
                    '{"type": "object", "properties": {"product_name": {"type": "string"}, "price": {"type": "number"}, "features": {"type": "array", "items": {"type": "string"}}, "colors": {"type": "array", "items": {"type": "string"}}, "warranty_years": {"type": "number"}, "rating": {"type": "object", "properties": {"score": {"type": "number"}, "reviews": {"type": "number"}}}}}'
                ],
                [
                    "The summer festival runs from July 1-5, 2024 at Central Park. VIP passes are $150 until June 1, then $200. General admission is $75 early bird (until June 15) and $100 regular. Contact tickets@summerfest.com",
                    '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}, "event_end_date": {"type": "string", "format": "date"}, "location": {"type": "string"}, "ticket_prices": {"type": "object", "properties": {"vip": {"type": "object", "properties": {"early_bird": {"type": "number"}, "regular": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}, "general": {"type": "object", "properties": {"early_bird": {"type": "number"}, "regular": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}}}, "contact_email": {"type": "string"}}}'
                ]
            ],
            inputs=[input_text, schema_text],
            label="Click on any example to try it"
        )

        # Event handlers: the button click and the textbox submit event run
        # the same conversion with the same inputs and output.
        conversion_event = {
            "fn": text_to_json,
            "inputs": [input_text, schema_text, max_tokens, temperature, top_p, top_k],
            "outputs": output_json,
            "show_progress": True,
        }
        convert_btn.click(**conversion_event)
        # NOTE(review): for a multi-line Textbox the submit shortcut is
        # presumably Shift+Enter rather than plain Enter - confirm against
        # the installed Gradio version.
        input_text.submit(**conversion_event)

    return demo
# Script entry point: report token availability, load the model, then serve
# the UI only if loading succeeded.
if __name__ == "__main__":
    print("🌊 Initializing Osmosis Structure Demo...")

    # Purely informational; load_model() reads HF_KEY again on its own.
    print(
        "✅ HF_KEY found in environment"
        if os.environ.get("HF_KEY")
        else "⚠️ HF_KEY not found - this may cause issues with gated repositories"
    )

    # Without a loaded model there is nothing useful to serve.
    if not load_model():
        print("❌ Failed to load model. Please check your HF_KEY and model access permissions.")
    else:
        print("🚀 Creating Gradio interface...")
        demo = create_demo()
        demo.launch(ssr_mode=False, mcp_server=True)