Spaces:
Build error
Build error
| from typing import List, Dict, Any | |
| import numpy as np | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from search_engine import WebSearchEngine | |
| import logging | |
| logger = logging.getLogger(__name__) | |
class RAGEngine:
    """Web search combined with retrieval over an in-memory FAISS index.

    Text returned by ``WebSearchEngine`` is split into chunks, embedded with
    the ``sentence-transformers/all-MiniLM-L6-v2`` model on CPU, and added to
    a FAISS vector store that is created lazily on first use.

    The store is never reset, so it keeps growing for the lifetime of the
    instance and similarity lookups may return chunks gathered by earlier
    queries.
    """

    def __init__(self):
        """Create the search client, embedding model and text splitter."""
        self.web_search = WebSearchEngine()
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
        )
        # Built on the first call that actually has chunks to index.
        self.vector_store = None

    def process_and_store_content(self, content: str, metadata: Dict[str, Any] = None) -> None:
        """Split ``content`` into chunks and index them in the vector store.

        Args:
            content: Raw text to index.
            metadata: Optional mapping attached to every chunk. ``None`` is
                treated as an empty mapping.

        Raises:
            Exception: Any error from splitting or indexing is logged and
                re-raised unchanged.
        """
        try:
            texts = self.text_splitter.split_text(content)
            if not texts:
                # Nothing to index; leave the store (possibly still None)
                # untouched instead of handing FAISS an empty batch.
                return

            # One dict per chunk: a shared dict would make a later mutation
            # of one chunk's metadata show up on every chunk.
            base_metadata = metadata or {}
            metadatas = [dict(base_metadata) for _ in texts]

            if self.vector_store is None:
                self.vector_store = FAISS.from_texts(
                    texts, self.embeddings, metadatas=metadatas
                )
            else:
                self.vector_store.add_texts(texts, metadatas=metadatas)
        except Exception as e:
            logger.error("Error processing content: %s", e)
            raise

    def search_and_process(self, query: str, max_results: int = 5, similarity_k: int = 3) -> Dict:
        """Run a web search, index the results, and attach similar chunks.

        Args:
            query: Search query; also used for the similarity lookup.
            max_results: Passed through to ``WebSearchEngine.search``.
            similarity_k: Number of chunks requested from the vector store.

        Returns:
            The dict returned by the web search, with a ``'similar_chunks'``
            list added when the similarity lookup succeeds. On failure a dict
            with a single ``'error'`` key is returned instead; this method
            does not raise.
        """
        try:
            web_results = self.web_search.search(query, max_results)

            if 'error' in web_results:
                return {'error': web_results['error']}
            if not web_results.get('results'):
                return {'error': 'No results found'}

            for result in web_results['results']:
                if not isinstance(result, dict):
                    continue
                content = result.get('content')
                if not content:
                    # Missing, None or empty content has nothing to index.
                    continue
                try:
                    self.process_and_store_content(
                        content,
                        metadata={'url': result.get('url'), 'title': result.get('title')},
                    )
                except Exception as e:
                    # Best effort: one bad result must not abort the others.
                    logger.error("Error processing result: %s", e)

            if self.vector_store is not None:
                try:
                    web_results['similar_chunks'] = self.get_relevant_context(
                        query, k=similarity_k
                    )
                except Exception as e:
                    # The web results are still useful without the chunks.
                    logger.error("Error in similarity search: %s", e)

            return web_results
        except Exception as e:
            logger.error("Error in search_and_process: %s", e)
            return {'error': f"Search failed: {str(e)}"}

    def get_relevant_context(self, query: str, k: int = 3) -> List[Dict]:
        """Return up to ``k`` stored chunks closest to ``query``.

        Each entry has ``'content'``, ``'metadata'`` and ``'similarity_score'``
        keys. Returns an empty list when nothing has been indexed yet.

        NOTE(review): ``'similarity_score'`` is the raw value from
        ``similarity_search_with_score``; for a default FAISS index this is
        presumably a distance (lower means closer) — confirm before ranking
        on it.
        """
        if self.vector_store is None:
            return []

        scored_docs = self.vector_store.similarity_search_with_score(query, k=k)
        return [
            {
                'content': document.page_content,
                'metadata': document.metadata,
                'similarity_score': float(score),
            }
            for document, score in scored_docs
        ]