Spaces:

perellorets
/

alia

Paused

App Files Community

perellorets committed on Jan 18

Commit

ffbd730

verified ·

1 Parent(s): 2a0c31e

Update rag_system.py

Browse files

Files changed (1) hide show

rag_system.py +261 -261

rag_system.py CHANGED Viewed

@@ -1,261 +1,261 @@
-"""
-Sistema RAG simplificado para Hugging Face Spaces
-Version optimizada con Salamandra 7B Instruct
-"""
-import os
-from typing import List, Dict
-from dataclasses import dataclass
-import torch
-from sentence_transformers import SentenceTransformer
-from qdrant_client import QdrantClient
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import time
-@dataclass
-class RAGResult:
-    """Resultado de una consulta RAG."""
-    query: str
-    answer: str
-    sources: List[Dict]
-    retrieval_time: float
-    generation_time: float
-    total_time: float
-class RAGLLMSystem:
-    """Sistema RAG + Salamandra LLM."""
-    def __init__(self):
-        """Inicializar sistema."""
-        # Configuracion desde variables de entorno
-        self.qdrant_url = os.getenv("QDRANT_URL")
-        self.qdrant_api_key = os.getenv("QDRANT_API_KEY")
-        self.qdrant_collection = os.getenv("QDRANT_COLLECTION", "alia_turismo_docs")
-        # Modelo LLM
-        self.llm_model_name = "BSC-LT/salamandra-7b-instruct"
-        # Modelo de embeddings
-        self.embedding_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
-        # Detectar dispositivo
-        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
-        print(f"[RAG] Dispositivo: {self.device}")
-        # Inicializar componentes
-        self._init_qdrant_client()
-        self._init_embedding_model()
-        self._init_salamandra_model()
-    def _init_qdrant_client(self):
-        """Inicializar cliente de Qdrant."""
-        print(f"[RAG] Conectando a Qdrant Cloud...")
-        self.qdrant_client = QdrantClient(
-            url=self.qdrant_url,
-            api_key=self.qdrant_api_key
-        )
-        print(f"[RAG] Conectado a Qdrant")
-    def _init_embedding_model(self):
-        """Inicializar modelo de embeddings."""
-        print(f"[RAG] Cargando modelo de embeddings...")
-        self.embedding_model = SentenceTransformer(
-            self.embedding_model_name,
-            device=self.device
-        )
-        print(f"[RAG] Embeddings cargados")
-    def _init_salamandra_model(self):
-        """Inicializar Salamandra 7B Instruct."""
-        print(f"[RAG] Cargando Salamandra 7B Instruct...")
-        # Cargar tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(self.llm_model_name)
-        # Cargar modelo
-        if self.device == 'cuda':
-            self.llm_model = AutoModelForCausalLM.from_pretrained(
-                self.llm_model_name,
-                torch_dtype=torch.float16,
-                device_map="auto",
-                low_cpu_mem_usage=True
-            )
-            print(f"[RAG] Salamandra cargado en GPU")
-        else:
-            self.llm_model = AutoModelForCausalLM.from_pretrained(
-                self.llm_model_name,
-                torch_dtype=torch.float32,
-                low_cpu_mem_usage=True
-            )
-            print(f"[RAG] Salamandra cargado en CPU")
-        self.llm_model.eval()
-    def retrieve_context(
-        self,
-        query: str,
-        top_k: int = 5,
-        score_threshold: float = 0.6
-    ) -> List[Dict]:
-        """Recuperar documentos relevantes."""
-        # Generar embedding
-        query_embedding = self.embedding_model.encode(
-            query,
-            convert_to_numpy=True
-        )
-        # Buscar en Qdrant
-        results = self.qdrant_client.query_points(
-            collection_name=self.qdrant_collection,
-            query=query_embedding.tolist(),
-            limit=top_k
-        ).points
-        # Filtrar y formatear
-        documents = []
-        for result in results:
-            if result.score >= score_threshold:
-                documents.append({
-                    'content': result.payload.get('full_content', ''),
-                    'filename': result.payload.get('filename', ''),
-                    'category': result.payload.get('category', ''),
-                    'score': result.score,
-                    'id': result.id
-                })
-        return documents
-    def generate_answer(
-        self,
-        query: str,
-        context_docs: List[Dict],
-        max_new_tokens: int = 1024,
-        temperature: float = 0.7,
-        top_p: float = 0.9
-    ) -> str:
-        """Generar respuesta con Salamandra."""
-        # Construir contexto
-        context_text = "\n\n---\n\n".join([
-            f"[Documento: {doc['filename']}]\n{doc['content'][:2000]}"
-            for doc in context_docs
-        ])
-        # Prompt
-        prompt = f"""Eres ALIA, un asistente experto en planificacion estrategica turistica de la Comunidad Valenciana.
-Tu funcion es ayudar a funcionarios publicos, tecnicos de turismo y responsables de destinos turisticos a:
-- Comprender y aplicar estrategias de planes turisticos
-- Obtener informacion sobre mejores practicas en turismo sostenible
-- Consultar casos de exito de otros municipios
-- Disenar e implementar planes estrategicos turisticos
-INSTRUCCIONES:
-1. Responde SIEMPRE basandote en los documentos proporcionados
-2. Si la informacion no esta en los documentos, indica claramente que no la tienes
-3. Cita los documentos fuente cuando sea relevante
-4. Usa un tono profesional pero accesible
-5. Estructura tus respuestas de forma clara con bullets o numeracion cuando sea apropiado
-CONTEXTO (Documentos de planes estrategicos de turismo):
-{context_text}
-PREGUNTA DEL USUARIO:
-{query}
-RESPUESTA:"""
-        # Tokenizar
-        inputs = self.tokenizer(
-            prompt,
-            return_tensors="pt",
-            truncation=True,
-            max_length=4096
-        )
-        # Mover a dispositivo
-        if self.device == 'cuda':
-            inputs = {k: v.cuda() for k, v in inputs.items()}
-        # Generar
-        try:
-            with torch.no_grad():
-                outputs = self.llm_model.generate(
-                    **inputs,
-                    max_new_tokens=max_new_tokens,
-                    temperature=temperature,
-                    top_p=top_p,
-                    do_sample=True,
-                    pad_token_id=self.tokenizer.eos_token_id,
-                    eos_token_id=self.tokenizer.eos_token_id,
-                )
-            # Decodificar
-            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            # Extraer respuesta
-            if "RESPUESTA:" in response:
-                response = response.split("RESPUESTA:")[-1].strip()
-            return response
-        except Exception as e:
-            return f"Error generando respuesta: {str(e)}"
-    def query(
-        self,
-        question: str,
-        top_k: int = 5,
-        score_threshold: float = 0.6,
-        max_new_tokens: int = 1024,
-        temperature: float = 0.7
-    ) -> RAGResult:
-        """Procesar consulta completa."""
-        start_time = time.time()
-        # Recuperar contexto
-        retrieval_start = time.time()
-        context_docs = self.retrieve_context(question, top_k, score_threshold)
-        retrieval_time = time.time() - retrieval_start
-        if not context_docs:
-            return RAGResult(
-                query=question,
-                answer="No se encontraron documentos relevantes para responder tu pregunta.",
-                sources=[],
-                retrieval_time=retrieval_time,
-                generation_time=0,
-                total_time=time.time() - start_time
-            )
-        # Generar respuesta
-        generation_start = time.time()
-        answer = self.generate_answer(
-            question,
-            context_docs,
-            max_new_tokens=max_new_tokens,
-            temperature=temperature
-        )
-        generation_time = time.time() - generation_start
-        # Preparar resultado
-        sources = [{
-            'filename': doc['filename'],
-            'category': doc['category'],
-            'score': doc['score']
-        } for doc in context_docs]
-        return RAGResult(
-            query=question,
-            answer=answer,
-            sources=sources,
-            retrieval_time=retrieval_time,
-            generation_time=generation_time,
-            total_time=time.time() - start_time
-        )

+"""
+Sistema RAG simplificado para Hugging Face Spaces
+Version optimizada con Salamandra 7B Instruct
+"""
+import os
+from typing import List, Dict
+from dataclasses import dataclass
+import torch
+from sentence_transformers import SentenceTransformer
+from qdrant_client import QdrantClient
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import time
+@dataclass
+class RAGResult:
+    """Resultado de una consulta RAG."""
+    query: str
+    answer: str
+    sources: List[Dict]
+    retrieval_time: float
+    generation_time: float
+    total_time: float
+class RAGLLMSystem:
+    """Sistema RAG + Salamandra LLM."""
+    def __init__(self):
+        """Inicializar sistema."""
+        # Configuracion desde variables de entorno
+        self.qdrant_url = os.getenv("QDRANT_URL")
+        self.qdrant_api_key = os.getenv("QDRANT_API_KEY")
+        self.qdrant_collection = os.getenv("QDRANT_COLLECTION", "alia_turismo_docs")
+        # Modelo LLM
+        self.llm_model_name = "BSC-LT/salamandra-7b-instruct"
+        # Modelo de embeddings
+        self.embedding_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+        # Detectar dispositivo
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        print(f"[RAG] Dispositivo: {self.device}")
+        # Inicializar componentes
+        self._init_qdrant_client()
+        self._init_embedding_model()
+        self._init_salamandra_model()
+    def _init_qdrant_client(self):
+        """Inicializar cliente de Qdrant."""
+        print(f"[RAG] Conectando a Qdrant Cloud...")
+        self.qdrant_client = QdrantClient(
+            url=self.qdrant_url,
+            api_key=self.qdrant_api_key
+        )
+        print(f"[RAG] Conectado a Qdrant")
+    def _init_embedding_model(self):
+        """Inicializar modelo de embeddings."""
+        print(f"[RAG] Cargando modelo de embeddings...")
+        self.embedding_model = SentenceTransformer(
+            self.embedding_model_name,
+            device=self.device
+        )
+        print(f"[RAG] Embeddings cargados")
+    def _init_salamandra_model(self):
+        """Inicializar Salamandra 7B Instruct con cuantizacion 8-bit."""
+        print(f"[RAG] Cargando Salamandra 7B Instruct (8-bit cuantizado)...")
+        # Cargar tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.llm_model_name)
+        # Cargar modelo con cuantizacion 8-bit para ahorrar memoria
+        if self.device == 'cuda':
+            self.llm_model = AutoModelForCausalLM.from_pretrained(
+                self.llm_model_name,
+                load_in_8bit=True,
+                device_map="auto",
+                low_cpu_mem_usage=True
+            )
+            print(f"[RAG] Salamandra cargado en GPU (8-bit)")
+        else:
+            self.llm_model = AutoModelForCausalLM.from_pretrained(
+                self.llm_model_name,
+                torch_dtype=torch.float32,
+                low_cpu_mem_usage=True
+            )
+            print(f"[RAG] Salamandra cargado en CPU")
+        self.llm_model.eval()
+    def retrieve_context(
+        self,
+        query: str,
+        top_k: int = 5,
+        score_threshold: float = 0.6
+    ) -> List[Dict]:
+        """Recuperar documentos relevantes."""
+        # Generar embedding
+        query_embedding = self.embedding_model.encode(
+            query,
+            convert_to_numpy=True
+        )
+        # Buscar en Qdrant
+        results = self.qdrant_client.query_points(
+            collection_name=self.qdrant_collection,
+            query=query_embedding.tolist(),
+            limit=top_k
+        ).points
+        # Filtrar y formatear
+        documents = []
+        for result in results:
+            if result.score >= score_threshold:
+                documents.append({
+                    'content': result.payload.get('full_content', ''),
+                    'filename': result.payload.get('filename', ''),
+                    'category': result.payload.get('category', ''),
+                    'score': result.score,
+                    'id': result.id
+                })
+        return documents
+    def generate_answer(
+        self,
+        query: str,
+        context_docs: List[Dict],
+        max_new_tokens: int = 1024,
+        temperature: float = 0.7,
+        top_p: float = 0.9
+    ) -> str:
+        """Generar respuesta con Salamandra."""
+        # Construir contexto
+        context_text = "\n\n---\n\n".join([
+            f"[Documento: {doc['filename']}]\n{doc['content'][:2000]}"
+            for doc in context_docs
+        ])
+        # Prompt
+        prompt = f"""Eres ALIA, un asistente experto en planificacion estrategica turistica de la Comunidad Valenciana.
+Tu funcion es ayudar a funcionarios publicos, tecnicos de turismo y responsables de destinos turisticos a:
+- Comprender y aplicar estrategias de planes turisticos
+- Obtener informacion sobre mejores practicas en turismo sostenible
+- Consultar casos de exito de otros municipios
+- Disenar e implementar planes estrategicos turisticos
+INSTRUCCIONES:
+1. Responde SIEMPRE basandote en los documentos proporcionados
+2. Si la informacion no esta en los documentos, indica claramente que no la tienes
+3. Cita los documentos fuente cuando sea relevante
+4. Usa un tono profesional pero accesible
+5. Estructura tus respuestas de forma clara con bullets o numeracion cuando sea apropiado
+CONTEXTO (Documentos de planes estrategicos de turismo):
+{context_text}
+PREGUNTA DEL USUARIO:
+{query}
+RESPUESTA:"""
+        # Tokenizar
+        inputs = self.tokenizer(
+            prompt,
+            return_tensors="pt",
+            truncation=True,
+            max_length=4096
+        )
+        # Mover a dispositivo
+        if self.device == 'cuda':
+            inputs = {k: v.cuda() for k, v in inputs.items()}
+        # Generar
+        try:
+            with torch.no_grad():
+                outputs = self.llm_model.generate(
+                    **inputs,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    do_sample=True,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                )
+            # Decodificar
+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            # Extraer respuesta
+            if "RESPUESTA:" in response:
+                response = response.split("RESPUESTA:")[-1].strip()
+            return response
+        except Exception as e:
+            return f"Error generando respuesta: {str(e)}"
+    def query(
+        self,
+        question: str,
+        top_k: int = 5,
+        score_threshold: float = 0.6,
+        max_new_tokens: int = 1024,
+        temperature: float = 0.7
+    ) -> RAGResult:
+        """Procesar consulta completa."""
+        start_time = time.time()
+        # Recuperar contexto
+        retrieval_start = time.time()
+        context_docs = self.retrieve_context(question, top_k, score_threshold)
+        retrieval_time = time.time() - retrieval_start
+        if not context_docs:
+            return RAGResult(
+                query=question,
+                answer="No se encontraron documentos relevantes para responder tu pregunta.",
+                sources=[],
+                retrieval_time=retrieval_time,
+                generation_time=0,
+                total_time=time.time() - start_time
+            )
+        # Generar respuesta
+        generation_start = time.time()
+        answer = self.generate_answer(
+            question,
+            context_docs,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature
+        )
+        generation_time = time.time() - generation_start
+        # Preparar resultado
+        sources = [{
+            'filename': doc['filename'],
+            'category': doc['category'],
+            'score': doc['score']
+        } for doc in context_docs]
+        return RAGResult(
+            query=question,
+            answer=answer,
+            sources=sources,
+            retrieval_time=retrieval_time,
+            generation_time=generation_time,
+            total_time=time.time() - start_time
+        )