import json
import threading
from threading import Thread

import gradio as gr
import torch
from flask import Flask, Response, request
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Flask app exposes a /chat SSE endpoint so external (non-Gradio) clients
# can stream completions from the same model.
flask_app = Flask(__name__)

# Load model and tokenizer once at startup.
# NOTE(review): float16 weights with device_map="cpu" can be slow and is
# unsupported for some ops on older torch CPU builds — confirm, or switch
# to torch.float32 for CPU inference.
model_id = "Qwen/Qwen2-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="cpu"
)


def generate_response(message, history):
    """Stream the model's reply to *message* as text chunks.

    Args:
        message: The new user message (str).
        history: Prior conversation as a list of ``{"role", "content"}``
            dicts (OpenAI-style messages format).

    Yields:
        str: Partial response text. Chunks are buffered and flushed when
        they contain punctuation or reach a minimum length, so the caller
        receives readable pieces rather than single tokens.
    """
    messages = [
        {
            "role": "system",
            "content": "You are Vidyut, an Indian AI created by Rapnss Production Studio India, designed for logical reasoning and problem-solving. Provide clear, step-by-step reasoning for all questions, ensuring accurate and concise answers.",
        }
    ] + history + [{"role": "user", "content": message}]

    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    # TextIteratorStreamer lets us iterate generated text while
    # model.generate runs in a background thread.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = {
        "inputs": inputs,
        "streamer": streamer,
        "max_new_tokens": 256,
        "do_sample": True,
        "top_p": 0.95,
        "temperature": 0.7,
    }
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Buffer token fragments into human-readable chunks before yielding.
    chunk_buffer = ""
    min_chunk_length = 10
    punctuation_marks = [".", ",", "!", "?", ";", ":"]
    for new_text in streamer:
        chunk_buffer += new_text
        if any(p in chunk_buffer for p in punctuation_marks) or len(chunk_buffer) >= min_chunk_length:
            yield chunk_buffer
            chunk_buffer = ""
    if chunk_buffer:  # flush any trailing text without punctuation
        yield chunk_buffer
    thread.join()


# Flask API endpoint for external sources
@flask_app.route('/chat', methods=['POST'])
def chat():
    """POST /chat — stream a completion as Server-Sent Events.

    Expects JSON ``{"message": str, "history": [...]}`` and responds with
    ``text/event-stream`` lines of the form ``data: {"text": "..."}``.
    """
    data = request.get_json()
    message = data.get('message', '')
    history = data.get('history', [])

    def stream():
        for chunk in generate_response(message, history):
            yield f"data: {json.dumps({'text': chunk})}\n\n"

    return Response(stream(), mimetype='text/event-stream')


# Gradio interface
def update_chatbot(message, history):
    """Gradio handler: stream the assistant reply into the chat history.

    Appends the user turn and an (initially empty) assistant turn once,
    then accumulates streamed chunks into that assistant turn.

    Bug fixed: the original re-appended the user message on every chunk
    and added each raw chunk as a separate assistant message, so the
    displayed history grew as [user, chunk1, user, chunk2, ...].

    Yields:
        (history, "") — updated messages list and an empty string that
        clears the input textbox.
    """
    prior = list(history)  # conversation before this turn, for the model
    history = prior + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": ""},
    ]
    for chunk in generate_response(message, prior):
        history[-1]["content"] += chunk
        yield history, ""
    return history, ""


with gr.Blocks(title="Vidyut Omega Reasoning Chatbot") as demo:
    gr.Markdown("# Vidyut Omega Reasoning Chatbot")
    gr.Markdown("Ask Vidyut reasoning or problem-solving questions, or use the /chat API for external access!")
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(placeholder="Type your question here...", label="Message")
    submit = gr.Button("Send")
    clear = gr.Button("Clear")
    submit.click(
        fn=update_chatbot,
        inputs=[msg, chatbot],
        outputs=[chatbot, msg]
    )
    clear.click(fn=lambda: [], inputs=None, outputs=chatbot)


# Run Flask in a separate thread so it serves alongside the Gradio app.
def run_flask():
    """Serve the Flask /chat API on port 8000 (all interfaces)."""
    flask_app.run(host="0.0.0.0", port=8000)


flask_thread = threading.Thread(target=run_flask)
flask_thread.daemon = True  # don't block process exit
flask_thread.start()

# Launch Gradio (blocking) on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)