# query / app.py
# (HuggingFace Space upload header — uploaded via huggingface_hub, rev 75eb0e0)
import gradio as gr
import requests
from bs4 import BeautifulSoup
import cohere
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from dotenv import load_dotenv
import asyncio
from apify_client import ApifyClient
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
# Load environment variables from a local .env file (supplies the API keys read below).
load_dotenv()

# Cohere client — NOTE(review): initialized but never referenced elsewhere in this file; confirm whether it is still needed.
co = cohere.Client(os.getenv('COHERE_API_KEY'))

# Apify client, used by search_apify() to run the Google-search actor.
apify_client = ApifyClient(os.getenv('APIFY_API_KEY'))

# Load the LeetCode problems dataset and keep it as a DataFrame for scoring.
dataset = load_dataset("RayBernard/leetcode", split="train")
df = pd.DataFrame(dataset)

# Sentence-transformer model shared by all embedding helpers below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Precompute one embedding per problem title; consumed by search_leetcode().
df['embeddings'] = df['title'].apply(lambda x: model.encode(x))
def search_apify(query):
    """Run the Google-search Apify actor for *query* and return up to 10 result URLs.

    Returns an empty list when the actor produces no result pages or no
    organic results (the original indexed ``items[0]`` unconditionally and
    raised IndexError in that case).
    """
    # Collapse runs of whitespace so the actor receives a clean query string.
    query = " ".join(query.split())
    run_input = {
        "queries": query,
        "resultsPerPage": 10,
        "maxPagesPerQuery": 1,
        "languageCode": "",
        "mobileResults": False,
        "includeUnfilteredResults": False,
        "saveHtml": False,
        "saveHtmlToKeyValueStore": False,
        "includeIcons": False
    }
    # Blocking call: waits for the actor run to finish, then reads its dataset.
    run = apify_client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)
    items = apify_client.dataset(run["defaultDatasetId"]).list_items().items
    if not items:
        return []
    organic = items[0].get('organicResults', [])
    urls = [item['url'] for item in organic if 'url' in item]
    return urls[:10]
def scrape_content(url):
    """Fetch *url* and return its visible text, or "" on any failure.

    Best-effort by design: a page that cannot be fetched or parsed simply
    contributes no content (callers treat "" as "no usable content").
    """
    try:
        # Bounded timeout so one hung server cannot stall the whole search.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except Exception:
        # Deliberately broad (covers both requests and parser errors), but no
        # longer a bare `except:` — KeyboardInterrupt/SystemExit now propagate.
        return ""
def get_embedding(text):
    """Encode *text* into a dense vector via the shared sentence-transformer model."""
    vector = model.encode(text)
    return vector
def calculate_relevancy(query_embedding, result_embedding):
    """Return the cosine similarity between two embedding vectors.

    Accepts anything convertible to a 1-D numpy array. Returns 0.0 when either
    vector has zero norm, matching sklearn's ``cosine_similarity`` convention.

    Replaces the sklearn pairwise call: for a single vector pair, a direct
    dot-product/norm computation avoids the 2-D reshaping and pairwise
    machinery while producing the same value.
    """
    q = np.asarray(query_embedding, dtype=float).ravel()
    r = np.asarray(result_embedding, dtype=float).ravel()
    denom = np.linalg.norm(q) * np.linalg.norm(r)
    if denom == 0.0:
        return 0.0
    return float(np.dot(q, r) / denom)
def search_leetcode(query, top_k=5):
    """Return the *top_k* LeetCode problems most similar to *query*.

    Output is a DataFrame with columns ``title``, ``difficulty`` and
    ``similarity`` (cosine score), sorted by similarity descending.
    """
    query_embedding = get_embedding(query)
    # Compute scores into a local Series instead of mutating the shared
    # module-level df (the original wrote a 'similarity' column in place on
    # every call — a hidden side effect on global state).
    similarities = df['embeddings'].apply(
        lambda emb: calculate_relevancy(query_embedding, emb)
    )
    ranked = df.assign(similarity=similarities).sort_values(
        'similarity', ascending=False
    ).head(top_k)
    return ranked[['title', 'difficulty', 'similarity']]
async def search_and_score(query):
    """Score web-search results and LeetCode problems against *query*.

    Returns a list of 2-element table rows: a header row followed by Apify
    URL/score rows (links as HTML anchors), then a header row followed by
    LeetCode title/score rows.

    NOTE(review): declared ``async`` but contains no awaits — all work here is
    synchronous/blocking. Kept async to preserve the wrapper's interface.
    """
    apify_results = search_apify(query)
    query_embedding = get_embedding(query)

    # Score each scraped page. Skip pages whose scrape failed (empty content):
    # the original embedded the empty string and produced a meaningless score.
    scored_results = []
    for url in apify_results:
        content = scrape_content(url)
        if not content:
            continue
        content_embedding = get_embedding(content)
        relevancy_score = calculate_relevancy(query_embedding, content_embedding)
        scored_results.append((url, relevancy_score))

    # Most relevant pages first.
    scored_results.sort(key=lambda pair: pair[1], reverse=True)

    leetcode_results = search_leetcode(query)

    # Interleave section headers with data rows for the Gradio Dataframe output.
    combined_results = [
        ["Apify Results", "Relevancy Score"],
        *[[f'<a href="{url}" target="_blank">{url}</a>', f'{score:.4f}']
          for url, score in scored_results],
        ["LeetCode Results", "Similarity Score"],
        *[["LeetCode: " + row['title'], f"{row['similarity']:.4f}"]
          for _, row in leetcode_results.iterrows()]
    ]
    return combined_results
# Wrapper function to run async function in sync context
def search_and_score_wrapper(query):
    """Synchronous entry point for Gradio: drive the async search to completion."""
    table_rows = asyncio.run(search_and_score(query))
    return table_rows
# Build the Gradio UI: one free-text query in, an HTML-capable two-column
# table of scored results out (links render as anchors via the "html" dtype).
iface = gr.Interface(
    fn=search_and_score_wrapper,
    inputs=gr.Textbox(lines=2, placeholder="Enter your search query here..."),
    outputs=gr.Dataframe(headers=["Result", "Score"], datatype=["html", "number"]),
    title="Search Relevancy Scorer with RAG",
    description="Enter a search query to get relevant results from web search and LeetCode dataset."
)
# Start the web server (blocks the main thread).
iface.launch()