Spaces:

jaothan
/

crunchbase_test1

Sleeping

File size: 2,809 Bytes

06f7804

import os

import gradio as gr
import numpy as np
import pandas as pd
import pinecone
from sentence_transformers import SentenceTransformer

# Initialize Pinecone
# The key is read from the PINECONE_API_KEY environment variable so the real
# secret does not have to be written into this file. The original placeholder
# is kept as the fallback, so the script behaves as before when the variable
# is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "your-pinecone-api-key")
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
# Name of the Pinecone index that the upload and query functions operate on.
index_name = 'company-recommendations'

# Load the dataset (replace with your dataset)
def load_data(path='company_data.csv'):
    """Read the company CSV and tag every row with a sequential integer id.

    Args:
        path: Location of the CSV file. Defaults to ``'company_data.csv'``,
            the file name this script has always used. Other functions in
            this file read the ``name``, ``description`` and ``region``
            columns of the returned frame.

    Returns:
        A DataFrame with the CSV's columns plus an ``id`` column holding
        ``0 .. len(data) - 1`` in row order.
    """
    data = pd.read_csv(path)
    # Ids are row positions; with the default index produced by read_csv they
    # are also the row labels, which is what label-based lookups rely on.
    data["id"] = range(len(data))
    return data

# Generate embeddings and upload to Pinecone
def prepare_and_upload_data(data, batch_size=500):
    """Embed every company description and upsert the vectors into Pinecone.

    Each uploaded item is ``(str(id), embedding, {"region": region})``.

    Args:
        data: DataFrame with ``id``, ``description`` and ``region`` columns.
            A ``description_vector`` column holding each row's embedding is
            added to it in place.
        batch_size: Maximum number of vectors sent per upsert call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight model for embeddings
    print("Encoding company descriptions...")
    # Hand the encoder a plain list so it never depends on the frame's index labels.
    encoded_descriptions = model.encode(data['description'].tolist())
    # Reuse the frame's own index: a Series with a fresh 0..n-1 index would be
    # aligned by label and leave NaN in rows whose label is not in that range.
    data['description_vector'] = pd.Series(encoded_descriptions.tolist(), index=data.index)

    print("Uploading items to Pinecone...")
    items_to_upload = [
        (str(item_id), vector, {"region": region})
        for item_id, vector, region in zip(data['id'], data['description_vector'], data['region'])
    ]
    # Create the index handle once instead of once per batch.
    index = pc.Index(index_name)
    for start in range(0, len(items_to_upload), batch_size):
        index.upsert(vectors=items_to_upload[start:start + batch_size])

# Per-process cache shared by the two functions below. Keys used:
#   "model" - the SentenceTransformer used to embed queries
#   "data"  - the company DataFrame, stored once it has been uploaded
_APP_CACHE = {}


def _get_query_model():
    """Return the query embedding model, loading it on first use only."""
    if "model" not in _APP_CACHE:
        _APP_CACHE["model"] = SentenceTransformer('all-MiniLM-L6-v2')
    return _APP_CACHE["model"]


# Query Pinecone for top 5 matching companies in a specific region
def get_top_companies(description, region, top_k=5, data=None):
    """Return the companies in ``region`` closest to ``description``.

    Args:
        description: Free-text description to embed and search with.
        region: Value the ``region`` metadata of a match must equal.
        top_k: Maximum number of matches requested from Pinecone.
        data: DataFrame used to look up ``name``, ``description`` and
            ``region`` for each match, indexed so that ``int(match.id)`` is a
            valid row label. When omitted it is loaded with ``load_data()``.
            Previously this function read a module-level ``data`` that was
            never defined, so every call raised ``NameError``.

    Returns:
        A DataFrame with columns ``id``, ``score``, ``name``, ``description``
        and ``region``, one row per match in the order Pinecone returned them.
        It is empty when nothing matches.
    """
    if data is None:
        data = load_data()

    # Convert the embedding to a plain list of floats before sending it.
    query_vector = _get_query_model().encode(description).tolist()

    # Query Pinecone with region filter
    res = pc.Index(index_name).query(
        vector=query_vector,
        top_k=top_k,
        filter={"region": region}
    )

    # Extract results
    matches = res.matches
    ids = [match.id for match in matches]
    # Match ids are the stringified row ids written at upload time; turn them
    # back into integer row labels and fetch all matched rows in one lookup.
    row_labels = [int(_id) for _id in ids]
    rows = data.loc[row_labels, ['name', 'description', 'region']]
    return pd.DataFrame({
        'id': ids,
        'score': [match.score for match in matches],
        'name': rows['name'].tolist(),
        'description': rows['description'].tolist(),
        'region': rows['region'].tolist()
    })


# Gradio Interface
def gradio_interface(description, region):
    """Gradio callback: return the top matching companies for one request.

    The dataset is loaded, embedded and uploaded to Pinecone on the first
    request only. Later requests reuse the cached DataFrame instead of
    re-encoding and re-uploading every company each time.

    Args:
        description: Text entered by the user.
        region: Region selected by the user.

    Returns:
        The DataFrame produced by ``get_top_companies``.
    """
    if "data" not in _APP_CACHE:
        data = load_data()
        prepare_and_upload_data(data)
        # Cache only after the upload finished, so a failed first attempt is
        # retried on the next request instead of serving a partial index.
        _APP_CACHE["data"] = data
    return get_top_companies(description, region, data=_APP_CACHE["data"])

# Launch Gradio App
# Regions offered in the dropdown; the selected value is passed unchanged to
# gradio_interface as its second argument.
_REGION_CHOICES = ["North America", "Europe", "Asia", "South America", "Africa", "Australia"]

# Input and output widgets, built in the same order as before.
_description_box = gr.Textbox(label="Enter your company services description")
_region_dropdown = gr.Dropdown(_REGION_CHOICES, label="Select Region")
_results_table = gr.Dataframe(label="Top 5 Matching Companies")

iface = gr.Interface(
    fn=gradio_interface,
    inputs=[_description_box, _region_dropdown],
    outputs=_results_table,
    title="Company Recommendation Engine",
    description="Enter your company services description and select a region to find the top 5 matching companies.",
)

# Start serving the interface as soon as the script is executed.
iface.launch()