Ksjsjjdj committed · verified
Commit f8f0c2e · Parent(s): bf3068d

Update app.py

Files changed (1): app.py (+111 -92)

app.py CHANGED
@@ -11,7 +11,7 @@ import asyncio
 import random
 from typing import List, Optional, Union, Any, Dict

-# --- THIRD-PARTY LIBRARIES ---
+# --- ENVIRONMENT SETUP ---
 if os.environ.get("MODELSCOPE_ENVIRONMENT") == "studio":
     from modelscope import patch_hub
     patch_hub()
@@ -20,6 +20,7 @@ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
 os.environ["RWKV_V7_ON"] = "1"
 os.environ["RWKV_JIT_ON"] = "1"

+# --- LOCAL IMPORTS ---
 from config import CONFIG, ModelConfig
 from utils import (
     cleanMessages,
@@ -28,7 +29,6 @@ from utils import (
     format_bytes,
     log,
 )
-
 from huggingface_hub import hf_hub_download
 from loguru import logger
 from snowflake import SnowflakeGenerator
@@ -36,6 +36,7 @@ import numpy as np
 import torch
 import requests

+# --- OPTIONAL IMPORTS (Anti-Bias & Privacy) ---
 try:
     from duckduckgo_search import DDGS
     HAS_DDG = True
@@ -58,7 +59,7 @@ from fastapi.staticfiles import StaticFiles
 from fastapi.middleware.gzip import GZipMiddleware
 from pydantic import BaseModel, Field, model_validator

-# --- INITIALIZATION ---
+# --- INITIAL SETUP ---
 CompletionIdGenerator = SnowflakeGenerator(42, timestamp=1741101491595)

 if "cuda" in CONFIG.STRATEGY.lower() and not torch.cuda.is_available():
@@ -82,7 +83,7 @@ from api_types import (
     ChatCompletionChoice, ChatCompletionMessage
 )

-# --- MODEL STORAGE ---
+# --- MODEL STORAGE ---
 class ModelStorage:
     MODEL_CONFIG: Optional[ModelConfig] = None
     model: Optional[RWKV] = None
@@ -115,7 +116,7 @@ for model_config in CONFIG.MODELS:
     torch.cuda.empty_cache()
     gc.collect()

-# --- CLASSES AND TYPES ---
+# --- DATA CLASSES ---
 class ChatCompletionRequest(BaseModel):
     model: str = Field(default="rwkv-latest")
     messages: Optional[List[ChatMessage]] = Field(default=None)
@@ -138,49 +139,56 @@ class ChatCompletionRequest(BaseModel):
             raise ValueError("messages and prompt cannot coexist.")
         return data

-# --- COHERENCE ENGINE ---
-class CoherenceEngine:
+# --- NEUTRALITY ENGINE (CORE LOGIC) ---
+class NeutralityEngine:
     """
-    Dynamically adjusts the model's parameters to ensure coherent, sensible output.
+    Forces the model to stick strictly to the facts and strip out opinions.
     """
-    @staticmethod
-    def optimize_parameters(request: ChatCompletionRequest, has_search_results: bool):
-        # 1. If there are search results, lower the temperature to be FACTUAL
-        if has_search_results:
-            logger.info("[COHERENCE] Search results detected. Switching to FACTUAL mode.")
-            # Low temperature to stick to the data
-            request.temperature = 0.2
-            # Low top_p to weed out odd words
-            request.top_p = 0.15
-            # High penalty to avoid repeating the facts
-            request.presence_penalty = 0.5
-        else:
-            # Normal conversation mode
-            if request.temperature is None: request.temperature = 1.0
-            if request.top_p is None: request.top_p = 0.7
-
-        # 2. Loop (repetition) protection
-        if request.penalty_decay is None:
-            request.penalty_decay = 0.996  # Standard decay
+
+    # This prompt overrides any previous "personality" of the model.
+    # It is deliberately designed to sound robotic and precise.
+    UNBIASED_SYSTEM_PROMPT = """
+CRITICAL INSTRUCTION: You are a Neutral Information Processor.
+1. ABANDON all personality, emotions, and moral judgments.
+2. PROVIDE ONLY CONCRETE FACTS: dates, numbers, names, and verified events.
+3. DO NOT use adjectives like "good", "bad", "unfortunate", "lucky", "important".
+4. IF SOURCES CONFLICT: State "Source A claims X, while Source B claims Y". Do not resolve the conflict yourself.
+5. IF INFORMATION IS MISSING: State "Data unavailable". Do not speculate.
+6. OUTPUT FORMAT: Direct and concise. No introductions. No conclusions.
+""".strip()

+    @staticmethod
+    def enforce_objective_params(request: ChatCompletionRequest):
+        """
+        Tunes the sampling parameters to remove "creativity" (hallucinations).
+        """
+        # Extremely low temperature: the model will almost always pick the most likely (factual) word.
+        request.temperature = 0.1
+        # Strict top_p: cuts off any odd deviation in the text.
+        request.top_p = 0.1
+        # High penalty: keeps the model from looping now that it has no "creativity".
+        request.presence_penalty = 0.6
+        request.count_penalty = 0.6
+
     @staticmethod
-    def format_search_prompt(query: str, results: List[dict]) -> str:
-        """Builds a structured prompt designed to keep RWKV from getting confused."""
-        context = "Reference Information:\n"
+    def sanitise_search_results(query: str, results: List[dict]) -> str:
+        """
+        Formats the search results, stripping out noise and opinions.
+        """
+        context = "RAW DATA STREAM (Contains potential bias in source text - EXTRACT ONLY FACTS):\n"
         for i, res in enumerate(results):
-            context += f"[{i+1}] {res['body']} (Source: {res['title']})\n"
+            # Scrub the search text of stray characters
+            clean_body = res['body'].replace("\n", " ").strip()
+            context += f"ENTRY [{i+1}]: {clean_body} (SOURCE: {res['title']})\n"

-        # Strict instruction for the model
         instruction = (
-            "\nINSTRUCTION: "
-            "Answer the user's question using ONLY the Reference Information above. "
-            "Do not make up facts. If the information is missing, say 'I don't know based on the search results'. "
-            "Write coherently and clearly.\n"
+            "\nTASK: Synthesize the above DATA STREAM into a neutral report for the user query: " + query + "\n"
+            "FILTER: Ignore all opinions found in the source text. Extract only the objective reality.\n"
         )
         return context + instruction

-# --- APP SETUP ---
-app = FastAPI(title="RWKV Intelligent Server")
+# --- MIDDLEWARE & APP ---
+app = FastAPI(title="RWKV Objective Server")

 app.add_middleware(
     CORSMiddleware,
@@ -191,48 +199,47 @@ app.add_middleware(
 )
 app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=5)

-# --- MIDDLEWARE: FAKER IP ---
 @app.middleware("http")
-async def security_middleware(request: Request, call_next):
+async def privacy_middleware(request: Request, call_next):
+    # IP masking for privacy (keeps no real records)
     if HAS_FAKER:
         request.scope["client"] = (fake.ipv4(), request.client.port if request.client else 80)
-    response = await call_next(request)
-    return response
+    return await call_next(request)

-# --- SEARCH LOGIC ---
+# --- NEUTRAL WEB SEARCH ---
 search_cache = collections.OrderedDict()

-def search_web(query: str, max_results: int = 4) -> str:
+def search_web_neutral(query: str, max_results: int = 5) -> str:
     if not HAS_DDG: return ""
     if query in search_cache: return search_cache[query]

-    logger.info(f"[SEARCH] Querying: {query}")
+    logger.info(f"[NEUTRAL-SEARCH] Extracting facts for: {query}")
     try:
+        # Fetch more results so sources can be contrasted
         results = DDGS().text(query, max_results=max_results)
         if not results: return ""

-        # Use the CoherenceEngine for formatting
-        formatted_context = CoherenceEngine.format_search_prompt(query, results)
+        # Run everything through the neutrality filter
+        formatted = NeutralityEngine.sanitise_search_results(query, results)

-        # Simple cache
         if len(search_cache) > 50: search_cache.popitem(last=False)
-        search_cache[query] = formatted_context
-        return formatted_context
+        search_cache[query] = formatted
+        return formatted
     except Exception as e:
-        logger.error(f"[SEARCH] Error: {e}")
+        logger.error(f"[SEARCH-FAIL] {e}")
         return ""

-def should_search(msg: str, model: str) -> bool:
+def requires_external_facts(msg: str, model: str) -> bool:
+    # If the user explicitly requests online mode or uses data-related keywords
     if ":online" in model: return True
-    keywords = ["buscar", "google", "actualidad", "noticia", "quien es", "precio", "clima", "search", "news"]
+    keywords = ["dato", "hecho", "cuanto", "cuando", "quien", "fact", "number", "price", "fecha", "estadistica"]
     return any(k in msg.lower() for k in keywords)

-# --- CORE GENERATION ---
+# --- CORE GENERATION (RWKV) ---
 async def runPrefill(request: ChatCompletionRequest, ctx: str, model_tokens: List[int], model_state):
     ctx = ctx.replace("\r\n", "\n")
     tokens = MODEL_STORAGE[request.model].pipeline.encode(ctx)
-    tokens = [int(x) for x in tokens]
-    model_tokens += tokens
+    model_tokens.extend([int(x) for x in tokens])
     while len(tokens) > 0:
         out, model_state = MODEL_STORAGE[request.model].model.forward(tokens[: CONFIG.CHUNK_LEN], model_state)
         tokens = tokens[CONFIG.CHUNK_LEN :]
@@ -241,7 +248,7 @@ async def runPrefill(request: ChatCompletionRequest, ctx: str, model_tokens: List[int], model_state):

 def generate(request: ChatCompletionRequest, out, model_tokens: List[int], model_state, max_tokens=2048):
     args = PIPELINE_ARGS(
-        temperature=max(0.1, request.temperature),  # Avoid an absolute temperature of 0
+        temperature=request.temperature,
         top_p=request.top_p,
         alpha_frequency=request.count_penalty,
         alpha_presence=request.presence_penalty,
@@ -273,18 +280,23 @@ def generate(request: ChatCompletionRequest, out, model_tokens: List[int], model_state, max_tokens=2048):
             cache_word_list.append(tmp)
             out_last = i + 1

-            if len(cache_word_list) > 5:
+            # Small output buffer for smooth streaming
+            if len(cache_word_list) > 2:
                 yield {"content": cache_word_list.pop(0), "finish_reason": None}

     yield {"content": "".join(cache_word_list), "finish_reason": "length"}

-# --- ENDPOINTS ---
+# --- CHAT STREAM HANDLING ---
 async def chatResponseStream(request: ChatCompletionRequest, model_state: any, completionId: str, enableReasoning: bool):
-    # Prompt construction
-    prompt = f"{cleanMessages(request.messages, enableReasoning)}\n\nAssistant:{' <think' if enableReasoning else ''}"
+    # Final prompt construction
+    # NOTE: here we inject the forced System Prompt if one is missing
+
+    clean_msg = cleanMessages(request.messages, enableReasoning)
+    prompt = f"{clean_msg}\n\nAssistant:{' <think' if enableReasoning else ''}"

     out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)

+    # Stream header
     yield f"data: {ChatCompletionChunk(id=completionId, created=int(time.time()), model=request.model, choices=[ChatCompletionChoice(index=0, delta=ChatCompletionMessage(role='Assistant', content=''), finish_reason=None)]).model_dump_json()}\n\n"

     for chunk in generate(request, out, model_tokens, model_state, max_tokens=request.max_tokens or 4096):
@@ -296,62 +308,69 @@ async def chatResponseStream(request: ChatCompletionRequest, model_state: any, completionId: str, enableReasoning: bool):

     yield "data: [DONE]\n\n"

+
 @app.post("/api/v1/chat/completions")
 async def chat_completions(request: ChatCompletionRequest):
     completionId = str(next(CompletionIdGenerator))

-    # 1. Model Resolution
+    # 1. Model and alias resolution
     raw_model = request.model
     model_key = request.model.split(":")[0]
     is_reasoning = ":thinking" in request.model
     if ":online" in model_key: model_key = model_key.replace(":online", "")

-    # Alias Mapping
-    target_model_name = model_key
+    target_model = model_key
     if "rwkv-latest" in model_key:
-        if is_reasoning and DEFAULT_REASONING_MODEL_NAME: target_model_name = DEFAULT_REASONING_MODEL_NAME
-        elif DEFALUT_MODEL_NAME: target_model_name = DEFALUT_MODEL_NAME
+        if is_reasoning and DEFAULT_REASONING_MODEL_NAME: target_model = DEFAULT_REASONING_MODEL_NAME
+        elif DEFALUT_MODEL_NAME: target_model = DEFALUT_MODEL_NAME

-    if target_model_name not in MODEL_STORAGE:
-        raise HTTPException(404, f"Model {target_model_name} not found")
-
-    request.model = target_model_name
-
-    # 2. Defaults
-    default_sampler = MODEL_STORAGE[target_model_name].MODEL_CONFIG.DEFAULT_SAMPLER
+    if target_model not in MODEL_STORAGE:
+        raise HTTPException(404, f"Model {target_model} not found")
+    request.model = target_model
+
+    # 2. Load base parameters
+    default_sampler = MODEL_STORAGE[target_model].MODEL_CONFIG.DEFAULT_SAMPLER
     req_data = request.model_dump()
     for k, v in default_sampler.model_dump().items():
         if req_data.get(k) is None: req_data[k] = v
     realRequest = ChatCompletionRequest(**req_data)

-    # 3. ADVANCED MECHANISM: SEARCH & CONTEXT INJECTION
-    has_search = False
-    if realRequest.messages and realRequest.messages[-1].role == "user":
-        last_msg = realRequest.messages[-1].content
-        if should_search(last_msg, raw_model):
-            context = search_web(last_msg)
-            if context:
-                has_search = True
-                # Inject the context RIGHT before the user's last message
-                # This is crucial for coherence in RWKV
-                system_msg = ChatMessage(role="System", content=context)
-                realRequest.messages.insert(-1, system_msg)
+    # 3. NEUTRALIZATION AND FACT-INJECTION PHASE
+    # Inject the "Neutrality" System Prompt at the very beginning
+    neutral_system_msg = ChatMessage(role="System", content=NeutralityEngine.UNBIASED_SYSTEM_PROMPT)

-    # 4. ADVANCED MECHANISM: COHERENCE OPTIMIZATION
-    # This is where the "make it make sense" magic happens
-    CoherenceEngine.optimize_parameters(realRequest, has_search)
+    if realRequest.messages:
+        # If the user already set a system message, overwrite or append to it with priority
+        if realRequest.messages[0].role == "System":
+            # Concatenate to reinforce the neutrality order
+            realRequest.messages[0].content += f"\n\n{NeutralityEngine.UNBIASED_SYSTEM_PROMPT}"
+        else:
+            realRequest.messages.insert(0, neutral_system_msg)
+
+        # External fact lookup
+        last_msg = realRequest.messages[-1]
+        if last_msg.role == "user" and requires_external_facts(last_msg.content, raw_model):
+            facts_context = search_web_neutral(last_msg.content)
+            if facts_context:
+                # Surgical insertion right before the user's message
+                fact_msg = ChatMessage(role="System", content=facts_context)
+                realRequest.messages.insert(-1, fact_msg)
+
+    # 4. APPLY OBJECTIVE PARAMETERS
+    # Force the sampling parameters to avoid "creative hallucinations"
+    NeutralityEngine.enforce_objective_params(realRequest)

-    logger.info(f"[REQ] {completionId} | Model: {realRequest.model} | Search: {has_search} | Temp: {realRequest.temperature}")
+    logger.info(f"[REQ] {completionId} | Mode: OBJECTIVE_FACTS | Temp: {realRequest.temperature}")

     if request.stream:
         return StreamingResponse(chatResponseStream(realRequest, None, completionId, is_reasoning), media_type="text/event-stream")
-
-    # (Non-stream implementation simplified for brevity; streaming is usually used)
-    return StreamingResponse(chatResponseStream(realRequest, None, completionId, is_reasoning), media_type="text/event-stream")
+    else:
+        # Simple non-stream implementation (reuses the generator)
+        return StreamingResponse(chatResponseStream(realRequest, None, completionId, is_reasoning), media_type="text/event-stream")

 @app.get("/api/v1/models")
 async def list_models():
-    return {"object": "list", "data": [{"id": "rwkv-latest", "object": "model", "owned_by": "rwkv"}]}
+    return {"object": "list", "data": [{"id": "rwkv-latest", "object": "model"}]}

 app.mount("/", StaticFiles(directory="dist-frontend", html=True), name="static")
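For reviewers: the ordering logic in the new chat_completions handler is easy to misread, so here is a minimal standalone sketch of what it does to the message list. Plain dicts stand in for ChatMessage, and the placeholder texts are illustrative, not from the commit:

# Sketch: injection order in chat_completions (hypothetical stand-ins for ChatMessage).
messages = [
    {"role": "user", "content": "Hi"},
    {"role": "Assistant", "content": "Hello."},
    {"role": "user", "content": "price of gold today?"},
]

# 1. The neutrality prompt is inserted at index 0, ahead of everything else.
messages.insert(0, {"role": "System", "content": "<UNBIASED_SYSTEM_PROMPT>"})

# 2. insert(-1, ...) places the search context just BEFORE the last
#    element, so the user's question remains the final message.
messages.insert(-1, {"role": "System", "content": "<RAW DATA STREAM ...>"})

for m in messages:
    print(m["role"], "->", m["content"])
# System -> <UNBIASED_SYSTEM_PROMPT>
# user -> Hi
# Assistant -> Hello.
# System -> <RAW DATA STREAM ...>
# user -> price of gold today?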
 
 
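And a hedged client-side sketch for exercising the updated endpoint. The host and port are assumptions (adjust to wherever the server actually runs); the path, payload shape, and the "data: ... / [DONE]" SSE framing follow the handlers in this diff:

# Hypothetical client for /api/v1/chat/completions; base URL is assumed.
import json
import requests

BASE = "http://127.0.0.1:8000"  # assumption: adjust to the real deployment

payload = {
    # ":online" in the model name forces the web-search path,
    # as checked by requires_external_facts().
    "model": "rwkv-latest:online",
    "messages": [{"role": "user", "content": "When was RWKV-7 released?"}],
    "stream": True,
}

with requests.post(f"{BASE}/api/v1/chat/completions", json=payload, stream=True) as r:
    r.raise_for_status()
    for line in r.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip blank SSE separator lines
        data = line[len("data: "):]
        if data == "[DONE]":  # sentinel emitted by chatResponseStream
            break
        chunk = json.loads(data)
        # Each chunk mirrors ChatCompletionChunk: choices[0].delta.content
        print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)
print()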