# jarvis / app.py
# Author: nicolaydef
# Commit: "Update app.py" (acf2b6b, verified)
import ast
import base64
import json
import os

from flask import Flask, request, jsonify, render_template_string
from huggingface_hub import InferenceClient
# Flask application object; every route in this file is registered on it.
app = Flask(__name__)

# Hugging Face access token taken from the environment (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Switched the model to Llama Vision (it is more lenient with the API).
MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Inference client bound to MODEL_ID, shared by all requests.
client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)
# In-memory state shared by every route handler in this module:
#   task          - command text waiting to be executed (None when idle)
#   task_id       - identifier supplied together with the task
#   last_reply    - most recent "speak" text produced by the model
#   status        - READY / PENDING / DONE / ERROR
#   raw_ai_output - unparsed text of the last model response
state = dict(
    task=None,
    task_id=0,
    last_reply="Джарвис на связи.",
    status="READY",
    raw_ai_output="",
)
@app.route('/set_task', methods=['POST'])
def set_task():
    """Queue a new task and mark the shared state as PENDING.

    Expects a JSON object body with ``task`` (the command) and an optional
    ``id`` (defaults to 0).

    Returns:
        ``{"status": "ok"}`` on success, or a 400 response with an ``error``
        message when the body is not a JSON object or carries no task.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON
    # body, so the failure can be reported as a clean 400 below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({"status": "error", "error": "JSON object body required"}), 400

    task = payload.get("task")
    if not task:
        # Without a task there is nothing to hand out via /get_task, so
        # switching the status to PENDING would leave it stuck there.
        return jsonify({"status": "error", "error": "task is required"}), 400

    # `state` is mutated in place, so no `global` declaration is needed.
    state["task"] = task
    state["task_id"] = payload.get("id", 0)
    state["status"] = "PENDING"
    return jsonify({"status": "ok"})
@app.route('/get_task')
def get_task():
    """Return the currently stored task and its id as a JSON object."""
    pending = {
        "task": state["task"],
        "id": state["task_id"],
    }
    return jsonify(pending)
def _build_user_content(task, img_b64):
    """Build the ``content`` of the user chat message for the model call."""
    # Deliberately short and explicit prompt.
    instruction = f"Task: {task}. You are a PC robot. Output ONLY a JSON array of actions like click(cell 1-100), type(text), press(key), wait(sec), speak(text). Example: [{{'type':'press','key':'win'}},{{'type':'type','text':'notepad'}}]"
    if not isinstance(img_b64, str) or not img_b64:
        # No screenshot supplied: send the same text-only prompt as before.
        return f"<|image|>\n{instruction}"

    if img_b64.startswith("data:"):
        image_url = img_b64
    else:
        # Base64 of the PNG file signature always begins with "iVBOR";
        # anything else is assumed to be JPEG -- TODO confirm with the client.
        mime = "image/png" if img_b64.startswith("iVBOR") else "image/jpeg"
        image_url = f"data:{mime};base64,{img_b64}"

    # The image travels as a structured content part, so the literal
    # "<|image|>" marker is left out of the text part.
    return [
        {"type": "image_url", "image_url": {"url": image_url}},
        {"type": "text", "text": instruction},
    ]


def _parse_actions(ai_text):
    """Extract the action list from the model's raw text or raise ValueError."""
    start = ai_text.find('[')
    end = ai_text.rfind(']')
    if start == -1 or end < start:
        raise ValueError("JSON not found in response")
    snippet = ai_text[start:end + 1]

    try:
        # Strict JSON first, so apostrophes inside strings survive intact.
        actions = json.loads(snippet)
    except json.JSONDecodeError:
        try:
            # The prompt's example uses single quotes, so the model may
            # answer with a Python-style literal; literal_eval only accepts
            # literals and never executes code.
            actions = ast.literal_eval(snippet)
        except (ValueError, TypeError, SyntaxError):
            # Last resort: the original quote swap (covers single-quoted
            # output that also contains true/false/null).
            actions = json.loads(snippet.replace("'", '"'))

    if not isinstance(actions, list) or not all(isinstance(a, dict) for a in actions):
        raise ValueError("Model response is not a list of action objects")
    return actions


@app.route('/process', methods=['POST'])
def process():
    """Ask the model to turn the stored task into a list of actions.

    Reads an optional base64 screenshot from the ``img`` field of the JSON
    body and sends it to the model together with the task prompt.

    Returns:
        ``{"actions": [...]}`` on success (the task is cleared and the status
        becomes DONE), or ``{"error": "..."}`` with HTTP 500 when the model
        call or the parsing of its output fails (status becomes ERROR).
    """
    # silent=True avoids an exception on a missing/invalid JSON body; the
    # request then simply proceeds without a screenshot.
    data = request.get_json(silent=True)
    img_b64 = data.get("img") if isinstance(data, dict) else None

    try:
        response = client.chat_completion(
            messages=[{
                "role": "user",
                "content": _build_user_content(state["task"], img_b64),
            }],
            max_tokens=300,
        )
        # `content` may be None; treat that as an empty answer.
        ai_res = (response.choices[0].message.content or "").strip()
        state["raw_ai_output"] = ai_res

        actions = _parse_actions(ai_res)
        for action in actions:
            # Remember the latest spoken reply for the /status endpoint.
            if action.get("type") == "speak" and "text" in action:
                state["last_reply"] = action["text"]

        state["task"] = None
        state["status"] = "DONE"
        return jsonify({"actions": actions})
    except Exception as e:
        # Request-level boundary: log the traceback, report the failure.
        app.logger.exception("Failed to process task")
        state["status"] = "ERROR"
        return jsonify({"error": str(e)}), 500
@app.route('/status')
def get_status():
    """Return the entire shared state as JSON."""
    snapshot = dict(state)
    return jsonify(snapshot)
@app.route('/')
def index():
    """Serve the single-page web console.

    The page posts the typed command to /set_task, polls /status every two
    seconds, and reads out each new ``last_reply`` with the browser's
    speech synthesis.
    """
    page_source = '''
<!DOCTYPE html>
<html>
<head>
<title>Jarvis Command</title>
<style>
body { background: #000; color: #fff; font-family: -apple-system, sans-serif; display: flex; justify-content: center; align-items: center; min-height: 100vh; margin: 0; }
.glass { background: rgba(255,255,255,0.05); backdrop-filter: blur(20px); border: 1px solid rgba(255,255,255,0.1); border-radius: 30px; padding: 40px; width: 400px; text-align: center; }
input { background: rgba(255,255,255,0.1); border: none; border-radius: 12px; color: #fff; padding: 15px; width: 100%; box-sizing: border-box; margin: 20px 0; outline: none; }
button { background: #0A84FF; border: none; border-radius: 12px; color: #fff; padding: 15px; width: 100%; cursor: pointer; font-weight: bold; }
.status { margin-top: 20px; color: #30D158; font-size: 14px; }
</style>
</head>
<body>
<div class="glass">
<h1>Jarvis OS</h1>
<div id="st" style="opacity:0.3; font-size:10px;">IDLE</div>
<input type="text" id="in" placeholder="Ваша команда...">
<button onclick="s()">ОТПРАВИТЬ</button>
<div id="re" class="status">Ожидание...</div>
<div style="font-size:8px; color:#222; margin-top:10px;" id="raw"></div>
</div>
<script>
let l = "";
async function s() {
const t = document.getElementById('in').value;
await fetch('/set_task', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify({task: t, id: Date.now()})});
document.getElementById('in').value = "";
}
setInterval(async () => {
const r = await fetch('/status'); const d = await r.json();
document.getElementById('st').innerText = d.status;
document.getElementById('raw').innerText = d.raw_ai_output;
if(d.last_reply !== l) {
l = d.last_reply; document.getElementById('re').innerText = l;
const u = new SpeechSynthesisUtterance(l); u.lang='ru-RU'; window.speechSynthesis.speak(u);
}
}, 2000);
document.body.onclick = () => window.speechSynthesis.speak(new SpeechSynthesisUtterance(""));
</script>
</body>
</html>
'''
    return render_template_string(page_source)
# Start Flask's built-in server only when this file is run as a script;
# it listens on every network interface, port 7860.
if __name__ == "__main__":
    app.run(port=7860, host="0.0.0.0")