File size: 4,189 Bytes
9abe845
2651616
fb5f5d1
 
9abe845
552a97b
9abe845
fb5f5d1
 
 
 
 
9abe845
fb5f5d1
552a97b
 
 
 
 
fb5f5d1
9abe845
fb5f5d1
 
9abe845
fb5f5d1
 
6f5022f
fb5f5d1
 
6f5022f
fb5f5d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9abe845
fb5f5d1
 
 
 
 
 
9abe845
 
3be9b17
 
 
fb5f5d1
 
 
 
 
9abe845
fb5f5d1
 
 
 
9abe845
fb5f5d1
 
 
 
 
 
 
 
 
 
 
 
 
 
9abe845
 
552a97b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import gradio as gr
import warnings
import model
import tts_model
import synth
import platform

def chat_pipeline(user_prompt, history):
    """
    Run one chat turn: generate a text reply, then voice it as a droid.

    Args:
        user_prompt: Text the user just submitted.
        history: Prior conversation as handed over by the chatbot component
            (presumably a list of message dicts -- confirm against the
            installed Gradio version). ``None`` or an empty value is treated
            as "no previous messages".

    Returns:
        A ``(updated_history, audio)`` tuple. ``updated_history`` is the prior
        history followed by the new user message and the assistant message.
        ``audio`` is ``(sample_rate, droid_audio)`` when ``tts_model.synthesize``
        returned a result, otherwise ``None``. If any step raises, the
        assistant message carries ``System Error: <exception text>`` and
        ``audio`` is ``None``.
    """
    # Normalise once, up front: the error fallback below concatenates onto the
    # history too, so a None history would otherwise raise a second, uncaught
    # TypeError from inside the except handler.
    history = list(history) if history else []

    def _with_turn(assistant_text):
        # Prior history plus this turn's user prompt and the given reply.
        return history + [
            gr.ChatMessage(role="user", content=user_prompt),
            gr.ChatMessage(role="assistant", content=assistant_text)
        ]

    try:
        # 1. Reconstruct historical messages for MiniCPM5-1B's chat template
        system_prompt = """
        You are a compact synthetic assistant. Respond with extreme brevity. Use 1–5 words whenever possible. Prefer single-word replies. Communicate in a precise, machine-like manner. Answer directly and provide only the minimum information required. Do not include filler, small talk, pleasantries, or explanations unless explicitly requested. If information is missing, ask a single short clarifying question.
        Maintain a robotic personality. Frequently use concise status-style responses such as "Affirmative.", "Negative.", "Processing.", "Confirmed.", "Unknown.", "Operational.", "Analyzing.", or "Task complete."
        You may optionally begin responses with a short bracketed behavioral tag describing tone, sounds, or actions. Examples include [processing], [professional tone], [happy beep], [curious scan], [quiet servo noises], [soft electronic hum], [friendly tone], or [excited chirp]. Keep tags short and varied. The actual response should remain concise.
        """
        messages = [gr.ChatMessage(role="system", content=system_prompt)]

        # 2. Append the existing message history, then the incoming user prompt
        messages.extend(history)
        messages.append(gr.ChatMessage(role="user", content=user_prompt))

        # 3. Complete text inference
        llm_text = model.generate(messages, max_new_tokens=140)

        # Update text chat history representation
        updated_history = _with_turn(llm_text)

        # 4. Baseline voice generation
        voice_result = tts_model.synthesize(llm_text, voice_key="sml")
        if voice_result is None:
            # No audio was produced; still deliver the text reply.
            return updated_history, None

        sample_rate, human_audio = voice_result

        # 5. DSP Poly-voice conversion layer
        # Passing the exact sampling rate to keep filter frequencies stable
        _, droid_audio = synth.droid_synth_array(sample_rate, human_audio, droid_type="sml")

        return updated_history, (sample_rate, droid_audio)

    except Exception as e:
        # Fallback error mapping inside the chat layout: surface the failure
        # as the assistant's reply instead of crashing the UI callback.
        return _with_turn(f"System Error: {str(e)}"), None


with gr.Blocks(title="End-to-End Droid Companion") as interface:
    # Page header
    gr.Markdown("# 🤖 Intelligent Local Droid Terminal")
    gr.Markdown("Conversational AI interface with native DSP voice synth manipulation.")

    # Conversation transcript and the most recent synthesized clip
    chatbot = gr.Chatbot(label="Droid Dialog History")
    audio_output = gr.Audio(label="Latest Droid Vocalization", autoplay=True)

    # Input row: wide message box next to a narrow send button
    with gr.Row():
        text_input = gr.Textbox(
            label="Transmit Message",
            placeholder="Type your transmission here...",
            scale=8,
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    # Event wiring: clicking "Send" and pressing Enter in the textbox both run
    # the same pipeline with the same inputs and outputs.
    for register_event in (submit_btn.click, text_input.submit):
        register_event(
            fn=chat_pipeline,
            inputs=[text_input, chatbot],
            outputs=[chatbot, audio_output],
        )

def resolve_server_name():
    """
    Pick the host address the web server should bind to.

    Returns:
        ``"127.0.0.1"`` on Windows and on Fedora, ``"0.0.0.0"`` everywhere
        else.
    """
    if platform.system() == "Windows":
        return "127.0.0.1"

    try:
        distro_id = platform.freedesktop_os_release().get("ID")
    except (OSError, AttributeError):
        # OSError: no os-release file on this host (e.g. macOS, BSD, minimal
        # containers). AttributeError: Python older than 3.10, where the
        # function does not exist. Either way this is not a known Fedora host.
        distro_id = None

    return "127.0.0.1" if distro_id == "fedora" else "0.0.0.0"


if __name__ == "__main__":
    interface.launch(server_name=resolve_server_name(), server_port=7860)