Experimentos_NPL_Quoota

Sleeping

App Files Files Community

Experimentos_NPL_Quoota / app.py

Malaji71

Update app.py

d966bc8 verified 5 months ago

raw

history blame contribute delete

4.4 kB

	import logging
	import os
	import pickle

	import faiss
	import gradio as gr
	import numpy as np
	from huggingface_hub import InferenceClient
	from sentence_transformers import SentenceTransformer

	# === CONFIGURATION ===
	# Chat model queried through the Hugging Face inference client.
	MODEL_NAME = "openai/gpt-oss-20b"
	# Read from the environment; configure it in the Space Secrets as HF_TOKEN
	# (the original note reads "HF_TOKEN = FS").
	HF_TOKEN = os.getenv("HF_TOKEN")

	# Fail fast at import time: without a token no inference call can be made.
	if not HF_TOKEN:
	    raise RuntimeError("❌ Falta la variable 'HF_TOKEN' en Secrets del Space.")

	# === LOAD FAISS INDEX AND DOCUMENTS ===
	index_path = "nlp_index.faiss"
	docs_path = "nlp_docs.pkl"

	# Both artifacts are required; stop before loading anything if either is missing.
	if not all(map(os.path.exists, (index_path, docs_path))):
	    raise FileNotFoundError("❌ Faltan 'nlp_index.faiss' o 'nlp_docs.pkl' en la raíz del Space.")

	index = faiss.read_index(index_path)

	# NOTE(review): pickle executes arbitrary code on load; this is only safe
	# while nlp_docs.pkl is a trusted artifact shipped with the Space.
	with open(docs_path, "rb") as f:
	    data = pickle.load(f)

	# The pickle is expected to be a mapping with "texts" and "sources" entries.
	texts, sources = data["texts"], data["sources"]

	# Sentence encoder used to embed user queries for the FAISS search.
	embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

	# === SEMANTIC RETRIEVAL ===
	def retrieve_context(query: str, k: int = 2) -> str:
	    """Return up to *k* indexed passages most similar to *query*.

	    The query is embedded with the module-level ``embedding_model``, scaled
	    to unit length and searched against the module-level FAISS ``index``.
	    Retrieval is best-effort: any failure is logged and an empty string is
	    returned, so the caller can fall back to the bare question.

	    Args:
	        query: User question to embed.
	        k: Maximum number of passages to retrieve.

	    Returns:
	        The matched passages separated by blank lines, or ``""`` when
	        nothing could be retrieved.
	    """
	    try:
	        emb = embedding_model.encode([query], convert_to_numpy=True).astype("float32")
	        norm = np.linalg.norm(emb)
	        if not np.isfinite(norm) or norm == 0:
	            # A degenerate embedding cannot be normalised; dividing by it
	            # would fill the query vector with NaN/inf.
	            return ""
	        emb = emb / norm
	        _, indices = index.search(emb, k)
	        # FAISS pads the result with -1 when fewer than k neighbours exist.
	        # Indexing texts[-1] would silently return the last document, so
	        # keep only labels that point at a real entry.
	        hits = [texts[i] for i in indices[0] if 0 <= i < len(texts)]
	        return "\n\n".join(hits)
	    except Exception:
	        # Deliberate best-effort: never break the chat because retrieval
	        # failed, but leave a trace instead of swallowing the error.
	        logging.getLogger(__name__).exception("Context retrieval failed")
	        return ""

	# === RAG RESPONSE WITH STREAMING ===
	def _build_rag_prompt(message, context):
	    """Wrap *message* with the retrieved *context*, or return it unchanged when there is none."""
	    if not context:
	        return message
	    return (
	        "Responde usando únicamente la siguiente información de contexto. Si el contexto no responde la pregunta, usa tu conocimiento general pero sé honesto sobre sus límites.\n\n"
	        f"--- CONTEXTO ---\n{context}\n--- FIN DEL CONTEXTO ---\n\n"
	        f"Pregunta:\n{message}"
	    )


	def respond(
	    message,
	    history: list[dict[str, str]],
	    system_message,
	    max_tokens,
	    temperature,
	    top_p,
	):
	    """Stream the model's answer to *message*, grounded on retrieved context.

	    Args:
	        message: Latest user message.
	        history: Previous turns as role/content dicts.
	        system_message: System prompt placed first in the conversation.
	        max_tokens: Upper bound on generated tokens, forwarded to the client.
	        temperature: Sampling temperature, forwarded to the client.
	        top_p: Nucleus-sampling threshold, forwarded to the client.

	    Yields:
	        The accumulated response text after each content-bearing chunk. On
	        failure, a single error message (appended to any partial text); if
	        the stream ends without content, a single warning message.
	    """
	    full_prompt = _build_rag_prompt(message, retrieve_context(message))

	    client = InferenceClient(token=HF_TOKEN, model=MODEL_NAME, timeout=60)

	    messages = [{"role": "system", "content": system_message}]
	    # Forward only the role/content of each turn.
	    # NOTE(review): some Gradio versions reportedly attach extra keys (e.g.
	    # "metadata") to history entries, which strict providers may reject —
	    # confirm against the deployed Gradio version. Assumes entries are dicts,
	    # as the annotation states.
	    messages.extend(
	        {key: value for key, value in turn.items() if key in ("role", "content")}
	        for turn in history or []
	    )
	    messages.append({"role": "user", "content": full_prompt})

	    response = ""
	    try:
	        for chunk in client.chat_completion(
	            messages,
	            max_tokens=max_tokens,
	            stream=True,
	            temperature=temperature,
	            top_p=top_p,
	        ):
	            # Some chunks carry no text; skip them.
	            if not chunk.choices:
	                continue
	            token = chunk.choices[0].delta.content
	            if token:
	                response += token
	                yield response
	    except Exception as e:
	        error_text = f"⚠️ Error durante la inferencia: {str(e)}"
	        # Keep whatever was already streamed and stop here, so the generic
	        # "no response" warning below cannot overwrite the real error.
	        yield f"{response}\n\n{error_text}" if response else error_text
	        return

	    # The stream finished cleanly but produced no text at all.
	    if not response:
	        yield "⚠️ El modelo no generó ninguna respuesta. Intenta con una pregunta más clara o específica."

	# === SPANISH-LANGUAGE INTERFACE ===
	# Chat UI wired to `respond`. The additional inputs are passed to it
	# positionally after (message, history), in this order: system message,
	# max tokens, temperature, top-p.
	chatbot = gr.ChatInterface(
	    respond,
	    type="messages",
	    title="🧠 Experimentos NPL Quoota",
	    description="Asistente basado en literatura de psicología cognitiva y desarrollo humano. Sistema RAG con más de 3.6 millones de caracteres indexados.",
	    additional_inputs=[
	        gr.Textbox(
	            value="Eres un asistente experto en desarrollo humano. Responde con claridad, profundidad y empatía, citando conceptos de los libros si es relevante.",
	            label="Mensaje del sistema",
	        ),
	        gr.Slider(
	            minimum=1,
	            maximum=2048,
	            value=2048,
	            step=1,
	            label="Máximo de tokens de salida",
	            info="Número máximo de tokens que el modelo generará en su respuesta. 2048 es el límite del modelo.",
	        ),
	        gr.Slider(
	            minimum=0.1,
	            # Capped at 2.0: the chat-completion API documents the temperature
	            # range as [0, 2], so higher values only produce request errors.
	            maximum=2.0,
	            value=0.7,
	            step=0.1,
	            label="Temperatura",
	            info="Controla la creatividad: valores bajos (ej. 0.2) dan respuestas más enfocadas y predecibles; valores altos (ej. 1.5+) dan respuestas más variadas, sorprendentes o arriesgadas.",
	        ),
	        gr.Slider(
	            minimum=0.1,
	            maximum=1.0,
	            value=0.95,
	            step=0.05,
	            label="Top-p (muestreo nuclear)",
	            info="Filtra opciones improbables. 0.95 equilibra diversidad y coherencia.",
	        ),
	    ],
	)

	# === LAUNCH ===
	# Host the chat interface inside a Blocks container; `demo` is the object
	# that gets launched.
	demo = gr.Blocks()
	with demo:
	    chatbot.render()

	# Start the server only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()