Spaces:
Paused
Paused
| import os | |
| import json | |
| import asyncio | |
| from typing import List, Dict, Any, Optional | |
| from langchain.prompts import ChatPromptTemplate | |
| from pathlib import Path | |
| from dotenv import load_dotenv | |
| import time | |
| from langchain_community.tools import BraveSearch | |
| from src.utils.api_key_manager import with_api_manager | |
| from src.helpers.helper import remove_markdown | |
class SearchEngine:
    """Brave-backed web search with LLM-assisted query rewriting and re-ranking.

    The class holds only the Brave API key. The language model used by
    ``generate_optimized_query`` and ``filter_urls`` is supplied per call via
    the keyword-only ``llm`` argument; it must expose an async ``ainvoke``
    returning an object with a ``content`` string.
    """

    # Brave Web Search limits: at most 20 results per request, page offset 0..9.
    _MAX_RESULTS_PER_PAGE = 20
    _MAX_PAGE_OFFSET = 9
    # Upper bound on consecutive HTTP 429 retries before giving up.
    _MAX_RATE_LIMIT_RETRIES = 5

    # Prompt used when past conversation context is available.
    _QUERY_TEMPLATE_WITH_CONTEXT = \
"""Objective:
Create a search engine optimized (SEO) query that accurately reflects the user's intent by utilizing their current query and relevant past context.
The generated SEO query should enhance visibility, relevance, and ranking on search engines.
Information:
The search engine being used is semantic in nature and requires a query that aligns with the user's intent while incorporating SEO best practices.
Instructions:
1. Understand the Inputs:
- User Query: This is the current question or statement provided by the user.
- Past Context: This includes any relevant previous interactions, preferences, or information that can inform the understanding of the user's intent.
2. Analyze the User Intent:
- Determine what the user is seeking to find or achieve with their query.
- Identify keywords and phrases that are central to the user's intent.
3. Incorporate SEO Best Practices:
- Keyword Optimization: Use relevant keywords that users are likely to search for. Include both primary and secondary keywords.
- Long-Tail Keywords: Incorporate longer, more specific keyword phrases that reflect the user's intent more precisely.
- Clarity and Relevance: Ensure the query is clear, concise, and directly related to the user's needs.
- Search Intent Alignment: Align the query with the type of content the user is likely seeking (informational, navigational, transactional, or commercial investigation).
- Optimal Length: Keep the query within 5-12 words to maintain effectiveness and avoid keyword stuffing.
4. Generate the SEO-Optimized Query:
- Combine the insights from the user query and past context.
- Formulate a search query that maximizes SEO potential while staying true to the user's intent.
5. Review and Refine:
- Ensure the generated query is free from grammatical errors.
- Verify that the query does not include unnecessary or irrelevant keywords.
- Confirm that the query is tailored to improve search engine rankings for the intended content.
6. Format [IMPORTANT]:
- If the user query is a question, the SEO-optimized query should also be a question.
- If the user query is a statement, the SEO-optimized query should be a clear and concise statement.
- Unless search results would be more accurate if the optimized query was a question.
Example 1:
- User Query:
'Best vegan restaurants in New York'
- Past Context:
'User has previously shown interest in healthy eating and sustainability.'
- SEO-Optimized Search Query:
'Top Vegan Restaurants in New York City for Healthy Dining'
Example 2:
- User Query:
'Give me a list of the best sci-fi movies'
- Past Context:
'User has a preference for classic science fiction films. Previous searches include "Blade Runner" and "2001: A Space Odyssey."'
- SEO-Optimized Search Query:
'What are the top classic science fiction movies to watch that are similar to Blade Runner and 2001: A Space Odyssey?'
Input:
- User Query:
{user_query}
- Past Context:
{context}
Output:
(The generated SEO-friendly query based on the inputs in plain text format without any markdown)"""

    # Prompt used when there is no past context to draw on.
    _QUERY_TEMPLATE_NO_CONTEXT = \
"""Objective:
Create a search engine optimized (SEO) query that accurately reflects the user's intent by utilizing their current query.
The generated SEO query should enhance visibility, relevance, and ranking on search engines.
Information:
The search engine being used is semantic in nature and requires a query that aligns with the user's intent while incorporating SEO best practices.
Instructions:
1. Understand the Input:
- User Query: This is the current question or statement provided by the user.
2. Analyze the User Intent:
- Determine what the user is seeking to find or achieve with their query.
- Identify keywords and phrases that are central to the user's intent.
3. Incorporate SEO Best Practices:
- Keyword Optimization: Use relevant keywords that users are likely to search for. Include both primary and secondary keywords.
- Long-Tail Keywords: Incorporate longer, more specific keyword phrases that reflect the user's intent more precisely.
- Clarity and Relevance: Ensure the query is clear, concise, and directly related to the user's needs.
- Search Intent Alignment: Align the query with the type of content the user is likely seeking (informational, navigational, transactional, or commercial investigation).
- Optimal Length: Keep the query within 5-12 words to maintain effectiveness and avoid keyword stuffing.
4. Generate the SEO-Optimized Query:
- Utilize the insights from the user query.
- Formulate a search query that maximizes SEO potential while staying true to the user's intent.
5. Review and Refine:
- Ensure the generated query is free from grammatical errors.
- Verify that the query does not include unnecessary or irrelevant keywords.
- Confirm that the query is tailored to improve search engine rankings for the intended content.
6. Format [IMPORTANT]:
- If the user query is a question, the SEO-optimized query should also be a question.
- If the user query is a statement, the SEO-optimized query should be a clear and concise statement.
- Unless search results would be more accurate if the optimized query was a question.
Example 1:
- User Query:
'Best vegan restaurants in New York'
- SEO-Optimized Search Query:
'Top Vegan Restaurants in New York City for Healthy Dining'
Example 2:
- User Query:
'Give me a list of the best sci-fi movies'
- SEO-Optimized Search Query:
'What are the top science fiction movies to watch?'
Input:
- User Query:
{user_query}
Output:
(The generated SEO-friendly query based on the input in plain text format without any markdown)"""

    # Prompt used to re-rank and filter search results.
    _RERANK_TEMPLATE = \
"""[IMPORTANT]
This is a very important task.
Please take a deep breath, read the instructions VERY carefully, and think step-by-step before responding.
[PROMPT]
You are an expert at determining the relevance of search results to a given query.
Your task is to re-rank the given search results based on their relevance to the original query.
Use a hybrid of semantic and keyword matching to determine relevance
Consider factors such as:
1. How well the title and snippet match the query intent
3. The credibility and authority of the source
4. The recency of the information (if applicable)
Rules:
1. Rerank the URLs based on their relevance to the query according to the criteria listed above, from best match to worst match.
2. Once reranked, select the top best matched results according to the category of the query as defined below:
- Advanced: Select upto 3 top best matched results
- Pro: Select upto 4 top best matched results
- Super: Select upto 5 top best matched results
- Ultra: Select upto 6 top best matched results
3. [IMPORTANT] Select the MINIMUM number of results (based on the categories above) that are required to answer the query.
4. The response should only contain a JSON array of objects, each containing 'link', 'title' and 'snippet' keys after reranking and filtering.
Note: Do not include ANY markdown in your response.
[INPUT]
Query Category:
{category}
Query:
{query}
Dictionary Containing Link, Titles and Snippets:
{link_info}
Ranked URLs (JSON array of objects):"""

    def __init__(
        self,
        brave_api_key: Optional[str] = None,
    ):
        """Store the Brave API key.

        Args:
            brave_api_key: Key to use. When ``None``, the ``BRAVE_API_KEY``
                environment variable is used instead.

        Raises:
            ValueError: If no key is passed and ``BRAVE_API_KEY`` is unset.
        """
        if brave_api_key is None:
            # Read the environment once and reuse the value.
            brave_api_key = os.getenv("BRAVE_API_KEY")
            if brave_api_key is None:
                raise ValueError("BRAVE_API_KEY is not set")
        self.brave_api_key = brave_api_key

    async def generate_optimized_query(self, user_query: str, context: Optional[str] = None, *, llm) -> str:
        """Ask the LLM to rewrite ``user_query`` into a search-optimized query.

        Args:
            user_query: The user's current question or statement.
            context: Optional past context. When falsy, the context-free
                prompt is used.
            llm: Model object with an async ``ainvoke`` method.

        Returns:
            The model's reply text with surrounding whitespace stripped.
        """
        if context:
            template = self._QUERY_TEMPLATE_WITH_CONTEXT
            inputs = {"user_query": user_query, "context": context}
        else:
            template = self._QUERY_TEMPLATE_NO_CONTEXT
            # Only pass variables the template declares; the context-free
            # prompt has no {context} placeholder.
            inputs = {"user_query": user_query}

        prompt_template = ChatPromptTemplate.from_template(template)
        prompt = prompt_template.format(**inputs)
        optimized_query = await llm.ainvoke(prompt)
        return optimized_query.content.strip()

    async def search(
        self,
        query: str,
        num_results: int = 10,
        gl: str = 'us',
        hl: str = 'en',
        safe: str = 'off',
        exclude_filetypes: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """Run a paginated Brave web search.

        Args:
            query: Search query text.
            num_results: Maximum number of results to return.
            gl: Value sent as Brave's ``country`` parameter.
            hl: Value sent as Brave's ``search_lang`` parameter.
            safe: Value sent as Brave's ``safesearch`` parameter.
            exclude_filetypes: File extensions to exclude; each is appended
                to the query as ``NOT filetype:<ext>``.

        Returns:
            Up to ``num_results`` result dicts as produced by the Brave tool.

        Raises:
            Exception: Any error raised by the Brave tool that is not a rate
                limit, or a rate-limit error that persists after the retry
                budget is exhausted.
            json.JSONDecodeError: If the tool output is not valid JSON.
        """
        # Construct exclusion string for filetypes (maintaining compatibility)
        exclusion = ''
        if exclude_filetypes:
            exclusion = ' ' + ' '.join(f"NOT filetype:{ft}" for ft in exclude_filetypes)
        modified_query = f"{query}{exclusion}"
        print(f"Performing search with query: '{modified_query}', num_results: {num_results}, gl: {gl}, hl: {hl}, safe: {safe}")

        all_results: List[Dict[str, Any]] = []
        # Brave's `offset` counts pages of size `count`, so the page size must
        # stay constant across requests; otherwise pages overlap. Any surplus
        # from the final page is trimmed on return.
        page_size = min(num_results, self._MAX_RESULTS_PER_PAGE)
        offset = 0
        rate_limit_retries = 0

        while len(all_results) < num_results and offset <= self._MAX_PAGE_OFFSET:
            # The tool takes its paging parameters at construction time, so a
            # new instance is built for every page.
            brave_search = BraveSearch.from_api_key(
                api_key=self.brave_api_key,
                search_kwargs={
                    "count": page_size,
                    "offset": offset,
                    "country": gl,
                    "search_lang": hl,
                    "safesearch": safe
                }
            )
            try:
                results_str = await asyncio.to_thread(brave_search.run, modified_query)
            except Exception as e:
                # Retry rate-limit errors a bounded number of times; re-raise
                # everything else (and persistent rate limiting) unchanged.
                if "429" not in str(e) or rate_limit_retries >= self._MAX_RATE_LIMIT_RETRIES:
                    raise
                rate_limit_retries += 1
                print("Brave API rate limit hit, waiting 1 second...")
                await asyncio.sleep(1)
                continue
            rate_limit_retries = 0

            # The tool returns a JSON string. Parse it as JSON rather than
            # eval(): eval is unsafe on web-derived text and fails on `null`.
            page_results = json.loads(results_str)
            if not page_results:  # No more results available
                break

            all_results.extend(page_results)
            offset += 1
            # Pause between pages to avoid the rate limit; skip the pause
            # when no further request will be made.
            if len(all_results) < num_results and offset <= self._MAX_PAGE_OFFSET:
                await asyncio.sleep(1)

        print(f"Total results fetched: {len(all_results)}")
        return all_results[:num_results]  # Ensure we don't return more than requested

    async def filter_urls(
        self,
        query: str,
        category: str,
        search_results: List[Dict[str, Any]],
        num_results: int = 3,
        *,
        llm
    ) -> List[Dict[str, str]]:
        """Re-rank and filter search results with the LLM.

        Args:
            query: The query the results should be relevant to.
            category: Query category named in the prompt (Advanced, Pro,
                Super or Ultra), which caps how many results are selected.
            search_results: Result dicts; only entries with non-empty
                ``link``, ``title`` and ``snippet`` are considered.
            num_results: Number of results returned by the fallback path
                when the LLM reply cannot be used.
            llm: Model object with an async ``ainvoke`` method.

        Returns:
            The list decoded from the LLM's JSON reply. If the reply is not
            a JSON array, the first ``num_results`` usable results in their
            original order. An empty list when there are no usable results.
        """
        # Keyed by link, so duplicate links collapse to the last occurrence.
        link_info = {}
        for result in search_results:
            link = result.get("link")
            title = result.get("title")
            snippet = result.get("snippet")
            if link and title and snippet:
                link_info[link] = {"title": title, "snippet": snippet}

        # Nothing to rank: skip the LLM call entirely.
        if not link_info:
            return []

        prompt = ChatPromptTemplate.from_template(self._RERANK_TEMPLATE)
        response = await llm.ainvoke(prompt.format_messages(category=category, query=query, link_info=link_info))
        cleaned_response = remove_markdown(response.content.strip())

        try:
            ranked_links = json.loads(cleaned_response)
        except json.JSONDecodeError:
            ranked_links = None

        # Valid JSON that is not an array is as unusable as malformed JSON.
        if not isinstance(ranked_links, list):
            print("Error decoding JSON response from LLM")
            return [
                {"link": link, "title": info["title"], "snippet": info["snippet"]}
                for link, info in list(link_info.items())[:num_results]
            ]

        print(f"Number of search results after reranking and filtering: {len(ranked_links)}")
        return ranked_links
if __name__ == "__main__":
    # Manual smoke test: run a few sample queries through Brave search and
    # print the results with timing.

    # Load environment variables
    load_dotenv()

    required_env_vars = ["BRAVE_API_KEY"]
    missing_vars = [var for var in required_env_vars if os.getenv(var) is None]
    if missing_vars:
        print(f"Environment variables are not set: {missing_vars}")
        # Exit with a failure status so callers can detect misconfiguration.
        raise SystemExit(1)
    print("All environment variables are set!")

    search_engine = SearchEngine()

    queries = [
        "Compare the benefits and drawbacks of AI in healthcare",
        "What is the impact of AI on healthcare?",
        "How is AI used in healthcare?",
        "What are the ethical considerations of AI in healthcare?",
        "What are the economic and social impacts of artificial intelligence on the job market?",
        "How can cold fusion be achieved without violating the laws of thermodynamics? And how can AGI help with that?",
        "What are the major obstacles to achieving carbon neutrality in heavy industries like steel and cement? What are the potential solutions?"
    ]

    async def main(queries: List[str], llm=None):
        """Search for each query and print the results.

        Args:
            queries: Queries to run.
            llm: Optional model passed to ``generate_optimized_query``. When
                omitted, the raw query is searched as-is, because query
                optimization requires a keyword-only ``llm`` argument.
        """
        for query in queries:
            if llm is not None:
                optimized_query = await search_engine.generate_optimized_query(query, llm=llm)
            else:
                optimized_query = query
            print(f"\nOriginal Query: {query}")
            print(f"Optimized Query: {optimized_query}\n")

            start = time.perf_counter()
            search_results = await search_engine.search(optimized_query, num_results=2, exclude_filetypes=["pdf"])
            end = time.perf_counter()
            print(f"Time taken to fetch search results: {end - start:.2f} seconds")

            # NOTE: re-ranking via search_engine.filter_urls is not exercised
            # here; it must be awaited and also needs an `llm`.
            print("Search Results:")
            for result in search_results:
                print(f"- {result['title']}: {result['link']}: {result['snippet']}")
            print("-"*20)

    asyncio.run(main(queries))