Spaces:
Paused
Paused
| import gradio as gr | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import cohere | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| import os | |
| from dotenv import load_dotenv | |
| import asyncio | |
| from apify_client import ApifyClient | |
| import pandas as pd | |
| from datasets import load_dataset | |
| from sentence_transformers import SentenceTransformer | |
# Load environment variables (API keys) from a .env file when one is present.
load_dotenv()

# Initialize Cohere client; the key is read from COHERE_API_KEY.
co = cohere.Client(os.getenv('COHERE_API_KEY'))

# Initialize Apify client; the token is read from APIFY_API_KEY.
apify_client = ApifyClient(os.getenv('APIFY_API_KEY'))

# Load the LeetCode dataset and expose it as a DataFrame.
dataset = load_dataset("RayBernard/leetcode", split="train")
df = pd.DataFrame(dataset)

# Initialize sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare embeddings for the dataset.
# All titles are encoded in one batched call rather than one model.encode()
# call per row, which is much faster at startup. list(...) splits the
# (n_rows, dim) result into one 1-D vector per row, so each cell of the
# 'embeddings' column still holds a single embedding vector.
df['embeddings'] = list(model.encode(df['title'].tolist()))
def search_apify(query):
    """Run a web search through the Apify actor and return result URLs.

    Args:
        query: Free-text search query; runs of whitespace (including
            newlines) are collapsed to single spaces before sending.

    Returns:
        A list of at most 10 URLs taken from the ``organicResults`` of the
        first dataset item, in the order the actor returned them. An empty
        list is returned when the query is blank, the actor run yields no
        run record, or there are no organic results.
    """
    query = " ".join(query.split())
    if not query:
        # Nothing to search for; skip the (billable) actor run entirely.
        return []

    run_input = {
        "queries": query,
        "resultsPerPage": 10,
        "maxPagesPerQuery": 1,
        "languageCode": "",
        "mobileResults": False,
        "includeUnfilteredResults": False,
        "saveHtml": False,
        "saveHtmlToKeyValueStore": False,
        "includeIcons": False,
    }
    run = apify_client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)
    if not run:
        # NOTE(review): the client appears to return None when the run
        # cannot be retrieved -- confirm against the Apify client docs.
        return []

    items = apify_client.dataset(run["defaultDatasetId"]).list_items().items
    if not items:
        return []

    # Only the first item (the single requested results page) is inspected.
    organic_results = items[0].get('organicResults') or []
    urls = [item['url'] for item in organic_results if 'url' in item]
    return urls[:10]
def scrape_content(url, timeout=10):
    """Fetch *url* and return the visible text of the page.

    This is a best-effort helper: any failure while fetching or parsing
    yields an empty string instead of raising.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            single unresponsive host cannot stall the whole request.

    Returns:
        The text extracted from the page's HTML, or ``""`` on any error.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except Exception:
        # Deliberately broad (network errors, bad URLs, parser failures),
        # but unlike a bare ``except:`` this lets KeyboardInterrupt and
        # SystemExit propagate.
        return ""
def get_embedding(text):
    """Encode *text* with the module-level sentence-transformer ``model``."""
    embedding = model.encode(text)
    return embedding
def calculate_relevancy(query_embedding, result_embedding):
    """Return the cosine similarity between two embedding vectors.

    Both inputs are converted to arrays and reshaped to a single row
    before being compared, so the result is one scalar score.
    """
    query_row, result_row = (
        np.array(vector).reshape(1, -1)
        for vector in (query_embedding, result_embedding)
    )
    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    similarity_matrix = cosine_similarity(query_row, result_row)
    return similarity_matrix[0][0]
def search_leetcode(query, top_k=5):
    """Return the LeetCode problems whose titles best match *query*.

    Args:
        query: Free-text search query.
        top_k: Number of best-matching rows to return.

    Returns:
        A DataFrame with the columns ``title``, ``difficulty`` and
        ``similarity``, sorted by descending cosine similarity between the
        query embedding and each stored title embedding.
    """
    query_row = np.array(get_embedding(query)).reshape(1, -1)

    # Stack the per-row vectors into one (n_rows, dim) matrix so every
    # similarity is computed in a single call instead of one call per row.
    title_matrix = np.vstack(df['embeddings'].tolist())
    scores = cosine_similarity(query_row, title_matrix)[0]

    # Attach the scores to a per-call frame rather than writing a
    # 'similarity' column into the shared module-level ``df``; concurrent
    # requests would otherwise overwrite each other's scores.
    scored = df[['title', 'difficulty']].assign(similarity=scores)
    results = scored.sort_values('similarity', ascending=False).head(top_k)
    return results[['title', 'difficulty', 'similarity']]
async def search_and_score(query):
    """Search the web and the LeetCode dataset and score results for *query*.

    Web results come from ``search_apify``; each page is scraped, embedded
    and scored against the query embedding with ``calculate_relevancy``.
    LeetCode results come from ``search_leetcode``.

    Args:
        query: Free-text search query.

    Returns:
        A list of two-element rows: a header row for the web results, one
        ``[html_link, score]`` row per URL (highest score first), a header
        row for the LeetCode results, then one ``[title, score]`` row per
        match. Scores are strings formatted to four decimal places.
    """
    # Local import so this function does not depend on a module-level import.
    from html import escape

    # Search using Apify
    urls = search_apify(query)

    # Get query embedding
    query_embedding = get_embedding(query)

    # Download all pages concurrently on the default executor; the fetches
    # are blocking network I/O, so running them one after another would
    # make the total latency the sum of every page's response time.
    # gather() preserves input order, so pages[i] belongs to urls[i].
    loop = asyncio.get_running_loop()
    pages = await asyncio.gather(
        *(loop.run_in_executor(None, scrape_content, url) for url in urls)
    )

    # Score each page against the query, then sort by relevancy score.
    scored_results = [
        (url, calculate_relevancy(query_embedding, get_embedding(page)))
        for url, page in zip(urls, pages)
    ]
    scored_results.sort(key=lambda pair: pair[1], reverse=True)

    # Search LeetCode dataset
    leetcode_results = search_leetcode(query)

    # The first output column is rendered as HTML, and both URLs and
    # titles originate from external data, so escape them before
    # interpolating into markup.
    web_rows = []
    for url, score in scored_results:
        safe_url = escape(url, quote=True)
        web_rows.append(
            [f'<a href="{safe_url}" target="_blank">{safe_url}</a>', f'{score:.4f}']
        )
    leetcode_rows = [
        ["LeetCode: " + escape(row['title']), f"{row['similarity']:.4f}"]
        for _, row in leetcode_results.iterrows()
    ]

    # Combine results
    return [
        ["Apify Results", "Relevancy Score"],
        *web_rows,
        ["LeetCode Results", "Similarity Score"],
        *leetcode_rows,
    ]
# Synchronous entry point: drives the async search to completion.
def search_and_score_wrapper(query):
    """Run ``search_and_score(query)`` on a fresh event loop and return its result."""
    pending = search_and_score(query)
    return asyncio.run(pending)
# Build the Gradio UI: a two-line query box feeding the search wrapper,
# with the results shown as a two-column table.
_query_input = gr.Textbox(lines=2, placeholder="Enter your search query here...")
_results_output = gr.Dataframe(headers=["Result", "Score"], datatype=["html", "number"])
iface = gr.Interface(
    fn=search_and_score_wrapper,
    inputs=_query_input,
    outputs=_results_output,
    title="Search Relevancy Scorer with RAG",
    description="Enter a search query to get relevant results from web search and LeetCode dataset.",
)

# Start serving the interface.
iface.launch()