"""Search relevancy scorer with RAG.

Combines live web search (via an Apify Google-search actor) with a local
LeetCode problem dataset, scoring every result against the user's query by
cosine similarity of sentence-transformer embeddings, and serves the ranked
results through a Gradio UI.
"""

import asyncio
import os

import cohere
import gradio as gr
import numpy as np
import pandas as pd
import requests
from apify_client import ApifyClient
from bs4 import BeautifulSoup
from datasets import load_dataset
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load environment variables (expects COHERE_API_KEY and APIFY_API_KEY).
load_dotenv()

# Initialize Cohere client (kept for parity with the original setup;
# not referenced by the functions below).
co = cohere.Client(os.getenv('COHERE_API_KEY'))

# Initialize Apify client used for the Google-search actor calls.
apify_client = ApifyClient(os.getenv('APIFY_API_KEY'))

# Load the LeetCode dataset into a DataFrame.
dataset = load_dataset("RayBernard/leetcode", split="train")
df = pd.DataFrame(dataset)

# Sentence-transformer model used for all embeddings (queries, pages, titles).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Precompute an embedding per problem title, once at startup.
df['embeddings'] = df['title'].apply(lambda x: model.encode(x))


def search_apify(query):
    """Run a Google search through the Apify actor and return up to 10
    organic-result URLs for *query*. Returns [] when the actor produced
    no dataset items."""
    # Collapse repeated whitespace so the actor gets a clean query string.
    query = " ".join(query.split())
    run_input = {
        "queries": query,
        "resultsPerPage": 10,
        "maxPagesPerQuery": 1,
        "languageCode": "",
        "mobileResults": False,
        "includeUnfilteredResults": False,
        "saveHtml": False,
        "saveHtmlToKeyValueStore": False,
        "includeIcons": False,
    }
    run = apify_client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)
    items = apify_client.dataset(run["defaultDatasetId"]).list_items().items
    # Guard against an empty run: items[0] would raise IndexError otherwise.
    if not items:
        return []
    urls = [item['url'] for item in items[0]['organicResults'] if 'url' in item]
    return urls[:10]


def scrape_content(url):
    """Fetch *url* and return its visible text; return "" on any request
    failure (best-effort scraping — a dead link should not abort scoring)."""
    try:
        # Timeout prevents one unresponsive server from hanging the app.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP errors are expected here.
        return ""


def get_embedding(text):
    """Return the sentence-transformer embedding vector for *text*."""
    return model.encode(text)


def calculate_relevancy(query_embedding, result_embedding):
    """Return the cosine similarity between two embedding vectors as a float."""
    query_embedding = np.array(query_embedding).reshape(1, -1)
    result_embedding = np.array(result_embedding).reshape(1, -1)
    return cosine_similarity(query_embedding, result_embedding)[0][0]


def search_leetcode(query, top_k=5):
    """Return the *top_k* LeetCode problems whose titles are most similar
    to *query*, as a DataFrame with title, difficulty and similarity."""
    query_embedding = get_embedding(query)
    df['similarity'] = df['embeddings'].apply(
        lambda x: calculate_relevancy(query_embedding, x)
    )
    results = df.sort_values('similarity', ascending=False).head(top_k)
    return results[['title', 'difficulty', 'similarity']]


async def search_and_score(query):
    """Search the web and the LeetCode dataset for *query* and return a
    combined, relevancy-ranked table (list of [label, score] rows)."""
    # Search using Apify
    apify_results = search_apify(query)

    # Get query embedding
    query_embedding = get_embedding(query)

    # Score each scraped page against the query.
    scored_results = []
    for url in apify_results:
        content = scrape_content(url)
        content_embedding = get_embedding(content)
        relevancy_score = calculate_relevancy(query_embedding, content_embedding)
        scored_results.append((url, relevancy_score))

    # Sort Apify results by relevancy score, best first.
    scored_results.sort(key=lambda x: x[1], reverse=True)

    # Search LeetCode dataset
    leetcode_results = search_leetcode(query)

    # Combine both result sets into one table with section-header rows.
    combined_results = [
        ["Apify Results", "Relevancy Score"],
        *[[f'{url}', f'{score:.4f}'] for url, score in scored_results],
        ["LeetCode Results", "Similarity Score"],
        *[["LeetCode: " + row['title'], f"{row['similarity']:.4f}"]
          for _, row in leetcode_results.iterrows()],
    ]

    return combined_results


def search_and_score_wrapper(query):
    """Sync wrapper so Gradio can call the async search_and_score."""
    return asyncio.run(search_and_score(query))


# Create Gradio interface
iface = gr.Interface(
    fn=search_and_score_wrapper,
    inputs=gr.Textbox(lines=2, placeholder="Enter your search query here..."),
    outputs=gr.Dataframe(headers=["Result", "Score"], datatype=["html", "number"]),
    title="Search Relevancy Scorer with RAG",
    description="Enter a search query to get relevant results from web search and LeetCode dataset.",
)

# Launch only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()