Ksjsjjdj committed on
Commit
1cde957
verified
1 Parent(s): f8f0c2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -113
app.py CHANGED
@@ -20,7 +20,7 @@ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
20
  os.environ["RWKV_V7_ON"] = "1"
21
  os.environ["RWKV_JIT_ON"] = "1"
22
 
23
- # --- IMPORTS PROPIOS ---
24
  from config import CONFIG, ModelConfig
25
  from utils import (
26
  cleanMessages,
@@ -36,12 +36,11 @@ import numpy as np
36
  import torch
37
  import requests
38
 
39
- # --- IMPORTS OPCIONALES (Anti-Bias & Privacidad) ---
40
  try:
41
  from duckduckgo_search import DDGS
42
  HAS_DDG = True
43
  except ImportError:
44
- logger.warning("duckduckgo_search not found. Web search disabled.")
45
  HAS_DDG = False
46
 
47
  try:
@@ -49,7 +48,6 @@ try:
49
  fake = Faker()
50
  HAS_FAKER = True
51
  except ImportError:
52
- logger.warning("Faker not found. IP masking disabled.")
53
  HAS_FAKER = False
54
 
55
  from fastapi import FastAPI, HTTPException, Request
@@ -139,56 +137,78 @@ class ChatCompletionRequest(BaseModel):
139
  raise ValueError("messages and prompt cannot coexist.")
140
  return data
141
 
142
- # --- MOTOR DE NEUTRALIDAD (CORE LOGIC) ---
143
class NeutralityEngine:
    """Rules that push the model towards facts only.

    Bundles a system prompt that forbids opinions, a sampler override, and a
    formatter that turns web-search hits into a context block for the model.
    """

    # Overrides any earlier "personality" of the model; the robotic, terse
    # wording is deliberate.
    UNBIASED_SYSTEM_PROMPT = """
CRITICAL INSTRUCTION: You are a Neutral Information Processor.
1. ABANDON all personality, emotions, and moral judgments.
2. PROVIDE ONLY CONCRETE FACTS: dates, numbers, names, and verified events.
3. DO NOT use adjectives like "good", "bad", "unfortunate", "lucky", "important".
4. IF SOURCES CONFLICT: State "Source A claims X, while Source B claims Y". Do not resolve the conflict yourself.
5. IF INFORMATION IS MISSING: State "Data unavailable". Do not speculate.
6. OUTPUT FORMAT: Direct and concise. No introductions. No conclusions.
""".strip()

    @staticmethod
    def enforce_objective_params(request: "ChatCompletionRequest"):
        """Overwrite the sampling parameters of *request* in place.

        Whatever the client sent for these four fields is discarded.
        """
        # Very low temperature: sampling stays close to the most likely token.
        request.temperature = 0.1
        # Tight nucleus: only the top of the distribution is eligible.
        request.top_p = 0.1
        # Repetition penalties, so near-greedy sampling does not loop.
        request.presence_penalty = 0.6
        request.count_penalty = 0.6

    @staticmethod
    def sanitise_search_results(query: str, results: List[dict]) -> str:
        """Format search hits as a numbered context block followed by a task instruction.

        Args:
            query: The user query, echoed into the trailing instruction.
            results: Search hits; each is expected to carry ``body`` and
                ``title`` keys. Hits without a body are skipped, and a
                missing title is rendered as ``unknown``.

        Returns:
            Header line, one ``ENTRY [n]`` line per usable hit, then the
            TASK/FILTER instruction.
        """
        header = "RAW DATA STREAM (Contains potential bias in source text - EXTRACT ONLY FACTS):\n"
        entries = []
        for res in results or []:
            # Flatten newlines so each hit stays on a single ENTRY line.
            body = (res.get("body") or "").replace("\n", " ").strip()
            if not body:
                continue  # an empty hit carries no facts, only noise
            title = res.get("title") or "unknown"
            entries.append(f"ENTRY [{len(entries) + 1}]: {body} (SOURCE: {title})\n")

        instruction = (
            "\nTASK: Synthesize the above DATA STREAM into a neutral report for the user query: " + query + "\n"
            "FILTER: Ignore all opinions found in the source text. Extract only the objective reality.\n"
        )
        return header + "".join(entries) + instruction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
- # --- MIDDLEWARE & APP ---
191
- app = FastAPI(title="RWKV Objective Server")
 
 
 
 
 
 
 
 
 
 
192
 
193
  app.add_middleware(
194
  CORSMiddleware,
@@ -201,41 +221,25 @@ app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=5)
201
 
202
  @app.middleware("http")
203
  async def privacy_middleware(request: Request, call_next):
204
- # Ocultaci贸n de IP para privacidad (No guarda registros reales)
205
  if HAS_FAKER:
206
  request.scope["client"] = (fake.ipv4(), request.client.port if request.client else 80)
207
  return await call_next(request)
208
 
209
- # --- BÚSQUEDA WEB NEUTRAL ---
210
  search_cache = collections.OrderedDict()
211
 
212
def search_web_neutral(query: str, max_results: int = 5) -> str:
    """Search the web for *query* and return a neutrality-formatted context block.

    Results are memoised in the module-level ``search_cache``. Failed or
    empty searches are not cached.

    Args:
        query: Text to search for.
        max_results: Upper bound on hits requested from DuckDuckGo.

    Returns:
        The formatted context, or ``""`` when search is unavailable, finds
        nothing, or fails.
    """
    if not HAS_DDG:
        return ""

    # Key on both arguments so a small earlier lookup is never served to a
    # caller asking for more results.
    cache_key = (query, max_results)
    if cache_key in search_cache:
        return search_cache[cache_key]

    logger.info("[NEUTRAL-SEARCH] Extracting facts for: %s", query)
    try:
        # Materialise the hits so the emptiness check also works if the
        # client hands back an iterator instead of a list.
        results = list(DDGS().text(query, max_results=max_results) or [])
        if not results:
            return ""

        formatted = NeutralityEngine.sanitise_search_results(query, results)

        # Evict the oldest entry (insertion order) to keep the cache bounded.
        if len(search_cache) > 50:
            search_cache.popitem(last=False)
        search_cache[cache_key] = formatted
        return formatted
    except Exception as e:
        # Best effort: a broken search must not fail the chat request, but
        # keep the traceback in the log.
        logger.exception("[SEARCH-FAIL] %s", e)
        return ""
231
-
232
def requires_external_facts(msg: str, model: str) -> bool:
    """Decide whether a user message should trigger a web lookup.

    Returns True when the model id carries the ``:online`` flag, or when the
    message contains one of the data-related keywords. Matching ignores case
    and accents, so "cuándo" matches the keyword "cuando".

    Args:
        msg: The user message text; ``None`` or empty is treated as no text.
        model: The model id as sent by the client, including any flags.
    """
    import unicodedata  # local import keeps this block self-contained

    if ":online" in model:
        return True

    keywords = ("dato", "hecho", "cuanto", "cuando", "quien", "fact",
                "number", "price", "fecha", "estadistica")
    # Decompose accented letters and drop the combining marks.
    decomposed = unicodedata.normalize("NFD", (msg or "").lower())
    plain = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return any(k in plain for k in keywords)
237
 
238
- # --- GENERACIÓN CORE (RWKV) ---
239
  async def runPrefill(request: ChatCompletionRequest, ctx: str, model_tokens: List[int], model_state):
240
  ctx = ctx.replace("\r\n", "\n")
241
  tokens = MODEL_STORAGE[request.model].pipeline.encode(ctx)
@@ -247,20 +251,26 @@ async def runPrefill(request: ChatCompletionRequest, ctx: str, model_tokens: Lis
247
  return out, model_tokens, model_state
248
 
249
  def generate(request: ChatCompletionRequest, out, model_tokens: List[int], model_state, max_tokens=2048):
 
 
250
  args = PIPELINE_ARGS(
251
  temperature=request.temperature,
252
  top_p=request.top_p,
253
- alpha_frequency=request.count_penalty,
254
- alpha_presence=request.presence_penalty,
255
- token_ban=[], token_stop=[0]
 
256
  )
 
257
  occurrence = {}
258
  out_tokens = []
259
  out_last = 0
260
  cache_word_list = []
261
 
262
  for i in range(max_tokens):
263
- for n in occurrence: out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency
 
 
264
 
265
  token = MODEL_STORAGE[request.model].pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
266
 
@@ -272,6 +282,7 @@ def generate(request: ChatCompletionRequest, out, model_tokens: List[int], model
272
  model_tokens.append(token)
273
  out_tokens.append(token)
274
 
 
275
  for xxx in occurrence: occurrence[xxx] *= request.penalty_decay
276
  occurrence[token] = 1 + (occurrence.get(token, 0))
277
 
@@ -280,23 +291,18 @@ def generate(request: ChatCompletionRequest, out, model_tokens: List[int], model
280
  cache_word_list.append(tmp)
281
  out_last = i + 1
282
 
283
- # Buffer de salida peque帽o para fluidez
284
- if len(cache_word_list) > 2:
285
  yield {"content": cache_word_list.pop(0), "finish_reason": None}
286
 
287
  yield {"content": "".join(cache_word_list), "finish_reason": "length"}
288
 
289
- # --- MANEJO DE FLUJO DE CHAT ---
290
  async def chatResponseStream(request: ChatCompletionRequest, model_state: any, completionId: str, enableReasoning: bool):
291
- # Construcci贸n del Prompt Final
292
- # NOTA: Aqu铆 inyectamos el System Prompt forzado si no existe
293
-
294
  clean_msg = cleanMessages(request.messages, enableReasoning)
295
  prompt = f"{clean_msg}\n\nAssistant:{' <think' if enableReasoning else ''}"
296
 
297
  out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
298
 
299
- # Header del stream
300
  yield f"data: {ChatCompletionChunk(id=completionId, created=int(time.time()), model=request.model, choices=[ChatCompletionChoice(index=0, delta=ChatCompletionMessage(role='Assistant', content=''), finish_reason=None)]).model_dump_json()}\n\n"
301
 
302
  for chunk in generate(request, out, model_tokens, model_state, max_tokens=request.max_tokens or 4096):
@@ -308,65 +314,50 @@ async def chatResponseStream(request: ChatCompletionRequest, model_state: any, c
308
 
309
  yield "data: [DONE]\n\n"
310
 
311
-
312
  @app.post("/api/v1/chat/completions")
313
  async def chat_completions(request: ChatCompletionRequest):
314
  completionId = str(next(CompletionIdGenerator))
315
 
316
- # 1. Resoluci贸n de Modelo y Alias
317
  raw_model = request.model
318
- model_key = request.model.split(":")[0]
319
  is_reasoning = ":thinking" in request.model
320
- if ":online" in model_key: model_key = model_key.replace(":online", "")
321
 
322
  target_model = model_key
323
  if "rwkv-latest" in model_key:
324
  if is_reasoning and DEFAULT_REASONING_MODEL_NAME: target_model = DEFAULT_REASONING_MODEL_NAME
325
  elif DEFALUT_MODEL_NAME: target_model = DEFALUT_MODEL_NAME
326
 
327
- if target_model not in MODEL_STORAGE:
328
- raise HTTPException(404, f"Model {target_model} not found")
329
  request.model = target_model
330
 
331
- # 2. Carga de par谩metros base
332
  default_sampler = MODEL_STORAGE[target_model].MODEL_CONFIG.DEFAULT_SAMPLER
333
  req_data = request.model_dump()
334
  for k, v in default_sampler.model_dump().items():
335
  if req_data.get(k) is None: req_data[k] = v
336
  realRequest = ChatCompletionRequest(**req_data)
337
 
338
- # 3. FASE DE NEUTRALIZACI脫N E INYECCI脫N DE HECHOS
339
- # Inyectamos el System Prompt de "Neutralidad" al principio de todo
340
- neutral_system_msg = ChatMessage(role="System", content=NeutralityEngine.UNBIASED_SYSTEM_PROMPT)
341
 
 
 
342
  if realRequest.messages:
343
- # Si el usuario ya puso un sistema, lo sobrescribimos o lo anexamos con prioridad
344
  if realRequest.messages[0].role == "System":
345
- # Concatenamos para reforzar la orden de neutralidad
346
- realRequest.messages[0].content += f"\n\n{NeutralityEngine.UNBIASED_SYSTEM_PROMPT}"
347
  else:
348
- realRequest.messages.insert(0, neutral_system_msg)
349
-
350
- # B煤squeda de hechos externos
351
- last_msg = realRequest.messages[-1]
352
- if last_msg.role == "user" and requires_external_facts(last_msg.content, raw_model):
353
- facts_context = search_web_neutral(last_msg.content)
354
- if facts_context:
355
- # Inserci贸n quir煤rgica justo antes del mensaje del usuario
356
- fact_msg = ChatMessage(role="System", content=facts_context)
357
- realRequest.messages.insert(-1, fact_msg)
358
-
359
- # 4. APLICACI脫N DE PAR脕METROS OBJETIVOS
360
- # Forzamos los par谩metros de sampling para evitar "alucinaciones creativas"
361
- NeutralityEngine.enforce_objective_params(realRequest)
362
-
363
- logger.info(f"[REQ] {completionId} | Mode: OBJECTIVE_FACTS | Temp: {realRequest.temperature}")
364
 
365
- if request.stream:
366
- return StreamingResponse(chatResponseStream(realRequest, None, completionId, is_reasoning), media_type="text/event-stream")
367
- else:
368
- # Implementaci贸n simple para no-stream (reutiliza el generador)
369
- return StreamingResponse(chatResponseStream(realRequest, None, completionId, is_reasoning), media_type="text/event-stream")
370
 
371
  @app.get("/api/v1/models")
372
  async def list_models():
 
20
  os.environ["RWKV_V7_ON"] = "1"
21
  os.environ["RWKV_JIT_ON"] = "1"
22
 
23
+ # --- IMPORTS ---
24
  from config import CONFIG, ModelConfig
25
  from utils import (
26
  cleanMessages,
 
36
  import torch
37
  import requests
38
 
39
+ # Dependencias Opcionales
40
  try:
41
  from duckduckgo_search import DDGS
42
  HAS_DDG = True
43
  except ImportError:
 
44
  HAS_DDG = False
45
 
46
  try:
 
48
  fake = Faker()
49
  HAS_FAKER = True
50
  except ImportError:
 
51
  HAS_FAKER = False
52
 
53
  from fastapi import FastAPI, HTTPException, Request
 
137
  raise ValueError("messages and prompt cannot coexist.")
138
  return data
139
 
140
+ # --- PROTOCOLO DE VERDAD Y FLUIDEZ ---
141
class TruthAndFlowProtocol:
    """Rules for factual answers without robotic repetition.

    Bundles a system prompt, a sampler override tuned against repetition
    loops, and a web search helper that builds a context block.
    """

    SYSTEM_INSTRUCTION = """
PROTOCOL: FACTUAL_AND_CONCISE
1. TRUTH: Say ONLY what is verified in the context or internal knowledge.
2. NO REPETITION: Do not repeat facts. Do not repeat sentence structures.
3. CONCISENESS: Get to the point directly.
4. LABELS: Use [VERIFICADO] for confirmed data, [INCIERTO] for contradictions.
5. NO FILLER: Avoid "As an AI", "I think", "Basically".
""".strip()

    @staticmethod
    def optimize_params(request: "ChatCompletionRequest"):
        """Overwrite the sampling parameters of *request* in place.

        Whatever the client sent for these five fields is discarded.
        """
        # Low but non-zero temperature leaves a little room to vary wording;
        # the original author notes that 0.0 falls into loops.
        request.temperature = 0.15
        # Tight nucleus: only the top of the distribution is eligible.
        request.top_p = 0.1
        # Per-occurrence penalty: grows each time the same token is emitted
        # again, which discourages runs like "y y y y".
        request.count_penalty = 1.2
        # Flat penalty on any token that has already been emitted.
        request.presence_penalty = 0.7
        # Occurrence counts are multiplied by this after every token, so
        # common words become usable again after a while.
        request.penalty_decay = 0.996

    @staticmethod
    def search_verify(query: str) -> str:
        """Search the web for *query* and return a labelled context block.

        Runs a normal search, plus a "fact check" search when the query
        contains a truth-related keyword. Hits without a body are skipped.

        Returns:
            The context block, or ``""`` when search is unavailable, finds
            nothing usable, or fails.
        """
        if not HAS_DDG:
            return ""
        try:
            ddgs = DDGS()
            # Materialise into a list so extend() below works even if the
            # client returns an iterator or None.
            results = list(ddgs.text(query, max_results=3) or [])

            lowered = query.lower()
            if any(w in lowered for w in ("verdad", "fake", "bulo", "cierto")):
                results.extend(ddgs.text(f"{query} fact check", max_results=2) or [])
        except Exception:
            # Best effort: a broken search must not fail the chat request,
            # but the failure should be visible in the log.
            logger.exception("[SEARCH-FAIL] web search failed for query %r", query)
            return ""

        entries = [
            f"- {r['body']} (Source: {r.get('title') or 'unknown'})\n"
            for r in results
            if r.get("body")
        ]
        if not entries:
            return ""
        return "VERIFIED CONTEXT (Use strict labels [VERIFICADO]/[INCIERTO]):\n" + "".join(entries)
209
+
210
+ # --- APP SETUP ---
211
+ app = FastAPI(title="RWKV High-Fidelity Server")
212
 
213
  app.add_middleware(
214
  CORSMiddleware,
 
221
 
222
  @app.middleware("http")
223
  async def privacy_middleware(request: Request, call_next):
 
224
  if HAS_FAKER:
225
  request.scope["client"] = (fake.ipv4(), request.client.port if request.client else 80)
226
  return await call_next(request)
227
 
228
+ # --- CACHÉ ---
229
  search_cache = collections.OrderedDict()
230
 
231
def get_context(query: str) -> str:
    """Return the web-search context for *query*, memoised in ``search_cache``.

    Only non-empty contexts are stored. An empty result means the search was
    unavailable, failed, or found nothing, and caching it would keep
    returning "" for that query after the search service recovers.

    Returns:
        The context block, or ``""`` when none could be obtained.
    """
    if query in search_cache:
        return search_cache[query]

    ctx = TruthAndFlowProtocol.search_verify(query)
    if not ctx:
        return ctx  # do not pin a failure in the cache

    # Evict the oldest entry (insertion order) to keep the cache bounded.
    if len(search_cache) > 50:
        search_cache.popitem(last=False)
    search_cache[query] = ctx
    return ctx
237
 
238
def needs_search(msg: str, model: str) -> bool:
    """Decide whether a user message should trigger a web lookup.

    Returns True when the model id carries the ``:online`` flag, or when the
    message contains one of the question/data keywords. Matching ignores
    case and accents, so "quién", "cuándo" and "dónde" match their
    unaccented keywords.

    Args:
        msg: The user message text; ``None`` or empty is treated as no text.
        model: The model id as sent by the client, including any flags.
    """
    import unicodedata  # local import keeps this block self-contained

    if ":online" in model:
        return True

    keywords = ("quien", "cuando", "donde", "precio", "es verdad", "dato")
    # Decompose accented letters and drop the combining marks.
    decomposed = unicodedata.normalize("NFD", (msg or "").lower())
    plain = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return any(k in plain for k in keywords)
 
241
 
242
+ # --- CORE RWKV LOOP ---
243
  async def runPrefill(request: ChatCompletionRequest, ctx: str, model_tokens: List[int], model_state):
244
  ctx = ctx.replace("\r\n", "\n")
245
  tokens = MODEL_STORAGE[request.model].pipeline.encode(ctx)
 
251
  return out, model_tokens, model_state
252
 
253
  def generate(request: ChatCompletionRequest, out, model_tokens: List[int], model_state, max_tokens=2048):
254
+ # Asignaci贸n correcta de penalizaciones a PIPELINE_ARGS
255
+ # Nota: alpha_frequency suele mapearse a count_penalty en la API de OpenAI
256
  args = PIPELINE_ARGS(
257
  temperature=request.temperature,
258
  top_p=request.top_p,
259
+ alpha_frequency=request.count_penalty, # Penalizaci贸n por repetici贸n exacta
260
+ alpha_presence=request.presence_penalty, # Penalizaci贸n por presencia de concepto
261
+ token_ban=[],
262
+ token_stop=[0]
263
  )
264
+
265
  occurrence = {}
266
  out_tokens = []
267
  out_last = 0
268
  cache_word_list = []
269
 
270
  for i in range(max_tokens):
271
+ # Aplicaci贸n manual de penalizaciones al vector de logits 'out'
272
+ for n in occurrence:
273
+ out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency
274
 
275
  token = MODEL_STORAGE[request.model].pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
276
 
 
282
  model_tokens.append(token)
283
  out_tokens.append(token)
284
 
285
+ # Decay: La memoria de repetici贸n se desvanece lentamente
286
  for xxx in occurrence: occurrence[xxx] *= request.penalty_decay
287
  occurrence[token] = 1 + (occurrence.get(token, 0))
288
 
 
291
  cache_word_list.append(tmp)
292
  out_last = i + 1
293
 
294
+ if len(cache_word_list) > 1:
 
295
  yield {"content": cache_word_list.pop(0), "finish_reason": None}
296
 
297
  yield {"content": "".join(cache_word_list), "finish_reason": "length"}
298
 
299
+ # --- HANDLER ---
300
  async def chatResponseStream(request: ChatCompletionRequest, model_state: any, completionId: str, enableReasoning: bool):
 
 
 
301
  clean_msg = cleanMessages(request.messages, enableReasoning)
302
  prompt = f"{clean_msg}\n\nAssistant:{' <think' if enableReasoning else ''}"
303
 
304
  out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
305
 
 
306
  yield f"data: {ChatCompletionChunk(id=completionId, created=int(time.time()), model=request.model, choices=[ChatCompletionChoice(index=0, delta=ChatCompletionMessage(role='Assistant', content=''), finish_reason=None)]).model_dump_json()}\n\n"
307
 
308
  for chunk in generate(request, out, model_tokens, model_state, max_tokens=request.max_tokens or 4096):
 
314
 
315
  yield "data: [DONE]\n\n"
316
 
 
317
  @app.post("/api/v1/chat/completions")
318
  async def chat_completions(request: ChatCompletionRequest):
319
  completionId = str(next(CompletionIdGenerator))
320
 
 
321
  raw_model = request.model
322
+ model_key = request.model.split(":")[0].replace(":online", "")
323
  is_reasoning = ":thinking" in request.model
 
324
 
325
  target_model = model_key
326
  if "rwkv-latest" in model_key:
327
  if is_reasoning and DEFAULT_REASONING_MODEL_NAME: target_model = DEFAULT_REASONING_MODEL_NAME
328
  elif DEFALUT_MODEL_NAME: target_model = DEFALUT_MODEL_NAME
329
 
330
+ if target_model not in MODEL_STORAGE: raise HTTPException(404, "Model not found")
 
331
  request.model = target_model
332
 
 
333
  default_sampler = MODEL_STORAGE[target_model].MODEL_CONFIG.DEFAULT_SAMPLER
334
  req_data = request.model_dump()
335
  for k, v in default_sampler.model_dump().items():
336
  if req_data.get(k) is None: req_data[k] = v
337
  realRequest = ChatCompletionRequest(**req_data)
338
 
339
+ # --- L脫GICA DE OPTIMIZACI脫N ---
 
 
340
 
341
+ # 1. System Prompt Anti-Repetici贸n
342
+ sys_msg = ChatMessage(role="System", content=TruthAndFlowProtocol.SYSTEM_INSTRUCTION)
343
  if realRequest.messages:
 
344
  if realRequest.messages[0].role == "System":
345
+ realRequest.messages[0].content = f"{TruthAndFlowProtocol.SYSTEM_INSTRUCTION}\n\n{realRequest.messages[0].content}"
 
346
  else:
347
+ realRequest.messages.insert(0, sys_msg)
348
+
349
+ # 2. Inyecci贸n de Contexto (si aplica)
350
+ last_msg = realRequest.messages[-1]
351
+ if last_msg.role == "user" and needs_search(last_msg.content, raw_model):
352
+ ctx = get_context(last_msg.content)
353
+ if ctx: realRequest.messages.insert(-1, ChatMessage(role="System", content=ctx))
354
+
355
+ # 3. Ajuste Fino de Par谩metros (El n煤cleo anti-repetici贸n)
356
+ TruthAndFlowProtocol.optimize_params(realRequest)
357
+
358
+ logger.info(f"[REQ] {completionId} | Params: T={realRequest.temperature} Freq={realRequest.count_penalty} Pres={realRequest.presence_penalty}")
 
 
 
 
359
 
360
+ return StreamingResponse(chatResponseStream(realRequest, None, completionId, is_reasoning), media_type="text/event-stream")
 
 
 
 
361
 
362
  @app.get("/api/v1/models")
363
  async def list_models():