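# Download a quantized Mistral-7B-Instruct GGUF model, serve it with
# llama_cpp.server, and expose a Gradio chat UI on top of it.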
import json
import subprocess
import requests
import time
import socket
import gradio as gr

# Check whether the server is accepting connections on the given port
def is_server_active(host, port):
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex((host, port)) == 0  # connect_ex returns 0 on success

# Download the model, streaming it to disk so the ~4 GB file is never held in memory
url = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_0.gguf?download=true"
with requests.get(url, stream=True) as resp:
    resp.raise_for_status()
    with open("./model.gguf", mode="wb") as file:
        for chunk in resp.iter_content(chunk_size=1 << 20):
            file.write(chunk)
print("Model downloaded")

# Launch the LLM server as a subprocess
command = ["python3", "-m", "llama_cpp.server", "--model", "./model.gguf", "--host", "0.0.0.0", "--port", "2600", "--n_threads", "2"]
server_process = subprocess.Popen(command)  # keep the handle so we can terminate it later
print("Model server starting...")

# Wait until the server accepts connections (it binds 0.0.0.0, so probe via localhost)
while not is_server_active("localhost", 2600):
    print("Waiting for server to start...")
    time.sleep(5)
print("Model server is ready!")

def response(message, history):
    url = "http://localhost:2600/v1/completions"
    # "stream": True matches the SSE ("data: ...") parsing below; the original
    # body requested a non-streamed completion, which that parsing never handled
    body = {"prompt": "[INST]" + message + "[/INST]", "max_tokens": 1024, "echo": False, "stream": True}
    response_text = ""

    try:
        # No timeout: wait indefinitely for the model to respond
        with requests.post(url, json=body, stream=True) as stream_response:
            for line in stream_response.iter_lines():
                if not line:
                    continue
                text = line.decode("utf-8")
                print("Raw response:", text)  # raw chunk, for debugging

                if text.startswith("data: "):
                    text = text[len("data: "):]
                if text == "[DONE]":  # end-of-stream sentinel
                    break
                if text.startswith("{") and "choices" in text:
                    try:
                        response_json = json.loads(text)
                        part = response_json["choices"][0]["text"]
                        print(part, end="", flush=True)
                        response_text += part
                        yield response_text  # let Gradio render partial output
                    except json.JSONDecodeError as e:
                        print("Failed to decode JSON:", e)
                        break
                elif text.strip():
                    print("Non-JSON response:", text)
                    break
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")

    yield response_text
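# Quick smoke test once the server is up (hypothetical usage; any prompt works):
#
#   for partial in response("What is the capital of France?", []):
#       pass  # each `partial` is the accumulated reply so far
#   print(partial)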

def cleanup_server():
    print("Closing server...")
    server_process.terminate()  # ask the server process to exit
    server_process.wait()  # block until it has terminated
    print("Server closed.")

# Configure and launch the Gradio interface
gr_interface = gr.ChatInterface(
    fn=response,
    title="Mistral-7B-Instruct-v0.2-GGUF Chatbot",
    theme='syddharth/gray-minimal'
)

try:
    gr_interface.launch(share=True)
finally:
    cleanup_server()  # always shut the model server down on exit