crunchbase_test1 / app4.py
jaothan's picture
Rename app.py to app4.py
02e9f52 verified
import os

import gradio as gr
import numpy as np
import pandas as pd
import pinecone
from sentence_transformers import SentenceTransformer
# Initialize Pinecone
# Read the API key from the environment so the real secret never has to be
# committed to the repository; the placeholder is kept as the fallback so the
# module-level name and its default value stay the same as before.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "your-pinecone-api-key")
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
# Name of the Pinecone index that the upload and query code in this file uses.
index_name = 'company-recommendations'
# Load the dataset (replace with your dataset)
def load_data():
    """Read ``company_data.csv`` and tag every row with a sequential ``id``.

    Returns:
        The CSV contents as a DataFrame with an extra ``id`` column that
        numbers the rows 0..len-1 in file order.
    """
    # Example dataset with company descriptions and regions
    frame = pd.read_csv('company_data.csv')  # Replace with your dataset
    row_count = len(frame)
    frame["id"] = range(row_count)
    return frame
# Generate embeddings and upload to Pinecone
def prepare_and_upload_data(data, batch_size=500):
    """Embed every company description and upsert the vectors into Pinecone.

    Args:
        data: DataFrame with ``id``, ``description`` and ``region`` columns.
            A ``description_vector`` column holding each row's embedding is
            added to it in place.
        batch_size: Number of vectors sent per upsert request (default 500,
            the value that was previously hard-coded).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    NOTE(review): nothing in this file creates the index, so it is assumed to
    already exist with a dimension matching the embedding model -- confirm.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight model for embeddings
    print("Encoding company descriptions...")
    # Encode a plain list rather than the Series so the encoder never depends
    # on the frame's index labels.
    vectors = model.encode(data['description'].tolist()).tolist()
    # Build the column on the frame's own index; a bare Series would be
    # aligned on 0..n-1 labels and leave NaN on any other index.
    data['description_vector'] = pd.Series(vectors, index=data.index)

    print("Uploading items to Pinecone...")
    items_to_upload = [
        (str(row_id), vector, {"region": region})
        for row_id, vector, region in zip(data['id'], vectors, data['region'])
    ]
    index = pc.Index(index_name)  # one handle reused for every batch
    for start in range(0, len(items_to_upload), batch_size):
        index.upsert(vectors=items_to_upload[start:start + batch_size])
# Query Pinecone for top 5 matching companies in a specific region
def get_top_companies(description, region, top_k=5, data=None):
    """Return the companies in ``region`` most similar to ``description``.

    Args:
        description: Free-text description to embed and search with.
        region: Value the ``region`` metadata of every match must equal.
        top_k: Maximum number of matches to request from Pinecone.
        data: DataFrame the match ids are looked up in, by index label.
            When omitted it is loaded with ``load_data()``; previously the
            function read an undefined global named ``data`` and failed
            with ``NameError``.

    Returns:
        DataFrame with ``id``, ``score``, ``name``, ``description`` and
        ``region`` columns, one row per match in the order Pinecone
        returned them.

    Raises:
        KeyError: If a returned match id is not an index label of ``data``.
    """
    if data is None:
        data = load_data()

    model = SentenceTransformer('all-MiniLM-L6-v2')
    # The encoder returns a numpy array; Pinecone's query API is documented
    # as taking a list of floats, so convert before sending.
    query_vector = model.encode(description).tolist()

    # Query Pinecone with region filter
    res = pc.Index(index_name).query(
        vector=query_vector,
        top_k=top_k,
        filter={"region": region},
    )

    # Extract results
    matches = res.matches
    ids = [match.id for match in matches]
    # One label-based lookup for all matches; ids were uploaded as strings of
    # the integer row ids, hence the int() conversion.
    rows = data.loc[[int(_id) for _id in ids]]
    return pd.DataFrame({
        'id': ids,
        'score': [match.score for match in matches],
        'name': rows['name'].tolist(),
        'description': rows['description'].tolist(),
        'region': rows['region'].tolist(),
    })


# Gradio Interface
def gradio_interface(description, region):
    """Gradio callback: upload the dataset, then return the top matches.

    Args:
        description: Text entered by the user.
        region: Region selected by the user.

    Returns:
        The DataFrame produced by ``get_top_companies``.

    NOTE(review): the dataset is reloaded, re-embedded and re-uploaded on
    every request; that behavior is kept unchanged here.
    """
    data = load_data()
    prepare_and_upload_data(data)
    # Pass the frame explicitly: it is local to this function and therefore
    # not visible to get_top_companies as a global.
    return get_top_companies(description, region, data=data)
# Launch Gradio App
# Widgets are built first, in the same order as before, then wired into the
# interface: the two inputs map positionally onto gradio_interface's
# (description, region) parameters.
_REGIONS = ["North America", "Europe", "Asia", "South America", "Africa", "Australia"]
_description_box = gr.Textbox(label="Enter your company services description")
_region_picker = gr.Dropdown(_REGIONS, label="Select Region")
_results_table = gr.Dataframe(label="Top 5 Matching Companies")

iface = gr.Interface(
    title="Company Recommendation Engine",
    description="Enter your company services description and select a region to find the top 5 matching companies.",
    fn=gradio_interface,
    inputs=[_description_box, _region_picker],
    outputs=_results_table,
)
iface.launch()