Spaces:

jaothan
/

crunchbase_test1

Sleeping

App Files Files Community

crunchbase_test1 / app4.py

jaothan

Rename app.py to app4.py

02e9f52 verified 12 months ago

raw

history blame contribute delete

2.81 kB

	import functools
	import os

	import gradio as gr
	import numpy as np
	import pandas as pd
	import pinecone
	from sentence_transformers import SentenceTransformer

	# Initialize Pinecone
	# The key is taken from the PINECONE_API_KEY environment variable so the real
	# secret never has to be written into this file; the placeholder below is
	# only a fallback and will not authenticate.
	PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "your-pinecone-api-key")
	pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
	# Name of the Pinecone index that every upsert and query in this file targets.
	index_name = 'company-recommendations'

	# Load the dataset (replace with your dataset)
	def load_data(path='company_data.csv'):
	    """Read the company dataset from a CSV file and number its rows.

	    Args:
	        path: Location of the CSV file. Defaults to ``'company_data.csv'``.
	            Other functions in this file read the ``name``, ``description``
	            and ``region`` columns of the result.

	    Returns:
	        A DataFrame holding the CSV contents plus an ``id`` column that
	        numbers the rows ``0 .. len(data) - 1`` in file order.
	    """
	    data = pd.read_csv(path)
	    data["id"] = range(len(data))
	    return data

	# Generate embeddings and upload to Pinecone
	def prepare_and_upload_data(data, batch_size=500):
	    """Embed every company description and upsert the vectors into Pinecone.

	    Each row is uploaded as ``(str(id), vector, {"region": region})``. As a
	    side effect, a ``description_vector`` column holding each row's embedding
	    (a list of floats) is added to ``data`` in place.

	    Args:
	        data: DataFrame with ``id``, ``description`` and ``region`` columns.
	        batch_size: Number of vectors sent per upsert request.
	    """
	    model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight model for embeddings
	    print("Encoding company descriptions...")
	    # Give the encoder a plain list so it never indexes the Series by label.
	    vectors = model.encode(data['description'].tolist()).tolist()
	    # Attach by position: a Series with a fresh 0..n-1 index would be aligned
	    # on labels and leave NaN for any row whose index label differs.
	    data['description_vector'] = pd.Series(vectors, index=data.index, dtype=object)

	    print("Uploading items to Pinecone...")
	    # zip over the columns avoids the per-row dtype coercion of iterrows().
	    items_to_upload = [
	        (str(item_id), vector, {"region": item_region})
	        for item_id, vector, item_region in zip(data['id'], vectors, data['region'])
	    ]
	    index = pc.Index(index_name)  # one handle reused for every batch
	    for start in range(0, len(items_to_upload), batch_size):
	        index.upsert(vectors=items_to_upload[start:start + batch_size])

	# Query Pinecone for top 5 matching companies in a specific region
	def get_top_companies(description, region, top_k=5, data=None):
	    """Return the companies in ``region`` that best match ``description``.

	    Args:
	        description: Free-text description to embed and search with.
	        region: Value the ``region`` metadata field of a match must equal.
	        top_k: Maximum number of matches to request from Pinecone.
	        data: Company DataFrame as produced by ``load_data``. When omitted,
	            it is loaded with ``load_data()``.

	    Returns:
	        A DataFrame with ``id``, ``score``, ``name``, ``description`` and
	        ``region`` columns, one row per match, in the order Pinecone
	        returned the matches.

	    Raises:
	        KeyError: If a returned match id has no corresponding row in ``data``.
	    """
	    if data is None:
	        # There is no module-level dataset to fall back on, so load it here.
	        data = load_data()

	    model = SentenceTransformer('all-MiniLM-L6-v2')
	    # The Pinecone client documents the query vector as a list of floats,
	    # so convert the NumPy array the encoder returns.
	    query_vector = model.encode(description).tolist()

	    # Query Pinecone with region filter
	    res = pc.Index(index_name).query(
	        vector=query_vector,
	        top_k=top_k,
	        filter={"region": region},
	    )

	    # Extract results
	    matches = res.matches
	    ids = [match.id for match in matches]
	    # Vector ids are the stringified ``id`` column, so resolve rows through
	    # that column; fall back to index labels if the column is absent.
	    by_id = data.set_index('id') if 'id' in data.columns else data
	    rows = by_id.loc[[int(_id) for _id in ids]]
	    return pd.DataFrame({
	        'id': ids,
	        'score': [match.score for match in matches],
	        'name': rows['name'].tolist(),
	        'description': rows['description'].tolist(),
	        'region': rows['region'].tolist(),
	    })

	# Gradio Interface
	@functools.lru_cache(maxsize=1)
	def _load_and_index_once():
	    """Load the dataset and upload its embeddings at most once per process.

	    ``lru_cache`` does not store a call that raised, so a failed load or
	    upload is retried on the next request.
	    """
	    prepare_and_upload_data(load_data())


	def gradio_interface(description, region):
	    """Handle one Gradio request: return the top matching companies.

	    The dataset is encoded and uploaded on the first request only; repeating
	    that work on every request would re-embed and re-upsert the whole CSV
	    each time. Changes to the CSV are therefore picked up only after a restart.

	    Args:
	        description: Text entered by the user.
	        region: Region selected by the user.

	    Returns:
	        The DataFrame produced by ``get_top_companies``.
	    """
	    _load_and_index_once()
	    return get_top_companies(description, region)

	# Launch Gradio App
	# Input widgets: free-text description plus a fixed list of regions.
	description_input = gr.Textbox(label="Enter your company services description")
	region_input = gr.Dropdown(
	    ["North America", "Europe", "Asia", "South America", "Africa", "Australia"],
	    label="Select Region",
	)
	# Output widget: the DataFrame returned by gradio_interface.
	results_output = gr.Dataframe(label="Top 5 Matching Companies")

	iface = gr.Interface(
	    fn=gradio_interface,
	    inputs=[description_input, region_input],
	    outputs=results_output,
	    title="Company Recommendation Engine",
	    description="Enter your company services description and select a region to find the top 5 matching companies.",
	)

	# Start the web server as soon as the script runs.
	iface.launch()