Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,7 @@ import threading
|
|
| 5 |
import time
|
| 6 |
import subprocess
|
| 7 |
import sys
|
| 8 |
-
subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python==0.3.
|
| 9 |
from llama_cpp import Llama
|
| 10 |
import requests
|
| 11 |
import tempfile
|
|
@@ -16,7 +16,7 @@ from concurrent.futures import ThreadPoolExecutor
|
|
| 16 |
app = Flask(__name__)
|
| 17 |
logging.basicConfig(level=logging.INFO)
|
| 18 |
|
| 19 |
-
MAX_CONTEXT_TOKENS = 1024 *
|
| 20 |
MAX_GENERATION_TOKENS = 1024 * 4
|
| 21 |
|
| 22 |
with open('engines.json', 'r') as f:
|
|
@@ -56,7 +56,7 @@ class LLMManager:
|
|
| 56 |
n_threads=2,
|
| 57 |
n_threads_batch=2,
|
| 58 |
use_mlock=True,
|
| 59 |
-
mmap=
|
| 60 |
low_vram=False,
|
| 61 |
vocab_only=False,
|
| 62 |
verbose=False,
|
|
@@ -179,62 +179,22 @@ def home():
|
|
| 179 |
status_html += f"<li>{model_name}: {status}</li>"
|
| 180 |
status_html += "</ul>"
|
| 181 |
|
| 182 |
-
return f'''
|
| 183 |
-
<!DOCTYPE html>
|
| 184 |
-
<html>
|
| 185 |
<head>
|
| 186 |
<title>TeleChars AI API</title>
|
| 187 |
-
<style>
|
| 188 |
-
body {{ font-family: Arial, sans-serif; margin: 40px; }}
|
| 189 |
-
.config {{ background: #f0f0f0; padding: 15px; border-radius: 5px; margin-bottom: 20px; }}
|
| 190 |
-
.endpoint {{ background: #e8f4f8; padding: 10px; border-left: 4px solid #2196F3; margin: 10px 0; }}
|
| 191 |
-
</style>
|
| 192 |
</head>
|
| 193 |
<body>
|
| 194 |
<h1>TeleChars AI API</h1>
|
| 195 |
-
|
| 196 |
-
<div class="config">
|
| 197 |
-
<h3>⚙️ Configuración</h3>
|
| 198 |
-
<p><strong>Max Context Tokens:</strong> {MAX_CONTEXT_TOKENS}</p>
|
| 199 |
-
<p><strong>Max Generation Tokens:</strong> {MAX_GENERATION_TOKENS}</p>
|
| 200 |
-
</div>
|
| 201 |
-
|
| 202 |
-
<h2>📦 Modelos cargados:</h2>
|
| 203 |
-
{status_html}
|
| 204 |
-
<p>Total modelos: {len(loaded_models)}/{len(MODELS)}</p>
|
| 205 |
-
|
| 206 |
<h2>🔗 Endpoints disponibles:</h2>
|
| 207 |
-
<div class="endpoint">
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
• temperature= (0.0-2.0)<br>
|
| 212 |
-
• top_p= (0.0-1.0)<br>
|
| 213 |
-
• top_k= (0-100)<br>
|
| 214 |
-
• model= (nombre del modelo)<br>
|
| 215 |
-
• max_tokens= (máximo tokens a generar, default: {MAX_GENERATION_TOKENS})<br>
|
| 216 |
-
• repetition_penalty= (penalización de repetición)<br>
|
| 217 |
-
• presence_penalty= (penalización de presencia)<br>
|
| 218 |
-
• frequency_penalty= (penalización de frecuencia)
|
| 219 |
-
</div>
|
| 220 |
-
|
| 221 |
-
<div class="endpoint">
|
| 222 |
-
<strong>POST /v1/chat/completions</strong><br>
|
| 223 |
-
Compatible con OpenAI API
|
| 224 |
-
</div>
|
| 225 |
-
|
| 226 |
-
<div class="endpoint">
|
| 227 |
-
<strong>GET /health</strong><br>
|
| 228 |
-
Estado del servicio
|
| 229 |
-
</div>
|
| 230 |
-
|
| 231 |
-
<div class="endpoint">
|
| 232 |
-
<strong>GET /models</strong><br>
|
| 233 |
-
Lista todos los modelos disponibles
|
| 234 |
-
</div>
|
| 235 |
</body>
|
| 236 |
-
</html>
|
| 237 |
-
'''
|
| 238 |
|
| 239 |
@app.route('/v1/chat/completions', methods=['POST'])
|
| 240 |
def chat_completions():
|
|
|
|
| 5 |
import time
|
| 6 |
import subprocess
|
| 7 |
import sys
|
| 8 |
+
subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python==0.3.20"])
|
| 9 |
from llama_cpp import Llama
|
| 10 |
import requests
|
| 11 |
import tempfile
|
|
|
|
| 16 |
app = Flask(__name__)
|
| 17 |
logging.basicConfig(level=logging.INFO)
|
| 18 |
|
| 19 |
+
MAX_CONTEXT_TOKENS = 1024 * 16
|
| 20 |
MAX_GENERATION_TOKENS = 1024 * 4
|
| 21 |
|
| 22 |
with open('engines.json', 'r') as f:
|
|
|
|
| 56 |
n_threads=2,
|
| 57 |
n_threads_batch=2,
|
| 58 |
use_mlock=True,
|
| 59 |
+
mmap=False,
|
| 60 |
low_vram=False,
|
| 61 |
vocab_only=False,
|
| 62 |
verbose=False,
|
|
|
|
| 179 |
status_html += f"<li>{model_name}: {status}</li>"
|
| 180 |
status_html += "</ul>"
|
| 181 |
|
| 182 |
+
return f'''<!DOCTYPE html><html>
|
|
|
|
|
|
|
| 183 |
<head>
|
| 184 |
<title>TeleChars AI API</title>
|
| 185 |
+
<style>body{{font-family:Arial, sans-serif;margin:40px}}.config{{background:#f0f0f0;padding:15px;border-radius:5px;margin-bottom:20px}}.endpoint{{background:#e8f4f8;padding:10px;border-left:4px solid #2196F3;margin:10px 0}}</style>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
</head>
|
| 187 |
<body>
|
| 188 |
<h1>TeleChars AI API</h1>
|
| 189 |
+
<div class=config><h3>⚙️ Configuración</h3><p><strong>Max Context Tokens:</strong> {MAX_CONTEXT_TOKENS}</p><p><strong>Max Generation Tokens:</strong> {MAX_GENERATION_TOKENS}</p></div>
|
| 190 |
+
<h2>📦 Modelos cargados:</h2>{status_html}<p>Total modelos: {len(loaded_models)}/{len(MODELS)}</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
<h2>🔗 Endpoints disponibles:</h2>
|
| 192 |
+
<div class=endpoint><strong>GET /generate/<mensaje>[?params]</strong><br>Devuelve solo el texto generado. Parámetros opcionales:<br>• system= (instrucciones del sistema)<br>• temperature= (0.0-2.0)<br>• top_p= (0.0-1.0)<br>• top_k= (0-100)<br>• model= (nombre del modelo)<br>• max_tokens= (máximo tokens a generar, default: {MAX_GENERATION_TOKENS})<br>• repetition_penalty= (penalización de repetición)<br>• presence_penalty= (penalización de presencia)<br>• frequency_penalty= (penalización de frecuencia)</div>
|
| 193 |
+
<div class=endpoint><strong>POST /v1/chat/completions</strong><br>Compatible con OpenAI API</div>
|
| 194 |
+
<div class=endpoint><strong>GET /health</strong><br>Estado del servicio</div>
|
| 195 |
+
<div class=endpoint><strong>GET /models</strong><br>Lista todos los modelos disponibles</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
</body>
|
| 197 |
+
</html>'''
|
|
|
|
| 198 |
|
| 199 |
@app.route('/v1/chat/completions', methods=['POST'])
|
| 200 |
def chat_completions():
|