Spaces:

tomascufaro
/

keyword_classification

Build error

App Files Files Community

keyword_classification / app.py

tomascufaro

collections

8fd5591 about 2 years ago

raw

history blame contribute delete

5.51 kB

	import pandas as pd # for data manipulation (pip install pandas)
	from langchain.chat_models import ChatOpenAI
	from langchain.chains import create_extraction_chain
	from langchain.chat_models import ChatOpenAI
	from langchain.prompts import ChatPromptTemplate
	import gradio as gr
	import os
	import collections

	# Extraction schema: every extracted item is a (keyword, category) pair of
	# strings, and both fields are mandatory.
	_FIELDS = ("keyword", "category")
	schema = {
	    "properties": {field: {"type": "string"} for field in _FIELDS},
	    "required": list(_FIELDS),
	}

	# Prompt templates. Both variants share the same system message and the
	# same closing reminder; they differ only in whether the human message
	# carries a {categories} placeholder.
	_SYSTEM_MESSAGE = ("system", "You are an expert marketing researcher")
	_TIP_MESSAGE = ("human", "Tip: Make sure to answer in the correct format and DO NOT leave keywords without category and DO NOT skip keywords. Please categorize all the keywords that I give you, each keyword must have just one and only one category.")

	# Template with placeholders {prompt_input}, {categories} and {keywords}.
	_HUMAN_WITH_CATEGORIES = ("human", """{prompt_input}.
	Here you have the categories splitted by coma: {categories}.
	and Here you have the keywords splitted by coma: {keywords}.""")

	# Template with placeholders {prompt_input} and {keywords} only.
	_HUMAN_WITHOUT_CATEGORIES = ("human", """{prompt_input}.
	and Here you have the keywords splitted by coma: {keywords}.""")

	prompt = ChatPromptTemplate.from_messages(
	    [_SYSTEM_MESSAGE, _HUMAN_WITH_CATEGORIES, _TIP_MESSAGE]
	)

	prompt_no_cat = ChatPromptTemplate.from_messages(
	    [_SYSTEM_MESSAGE, _HUMAN_WITHOUT_CATEGORIES, _TIP_MESSAGE]
	)

	# Chat model shared by both chains; the API key is read from the
	# "OpenAI_APIKEY" environment variable.
	llm = ChatOpenAI(
	    model="gpt-3.5-turbo",
	    temperature=0,
	    openai_api_key=os.getenv("OpenAI_APIKEY"),
	)

	# One extraction chain per prompt variant (with / without categories),
	# built in that order with identical schema, model and verbosity.
	chain, chain_no_cat = (
	    create_extraction_chain(schema, llm, template, verbose=1)
	    for template in (prompt, prompt_no_cat)
	)

	def _read_first_column(file_obj):
	    """Return the first column of an uploaded CSV/Excel file as a list of strings.

	    ``file_obj`` is either an object exposing the file path as ``.name`` or
	    the path itself. Blank cells are dropped and every remaining value is
	    converted to ``str`` so the result can always be passed to ``','.join``.
	    """
	    path = getattr(file_obj, "name", file_obj)
	    extension = os.path.splitext(str(path))[1].lower()
	    if extension in (".xlsx", ".xls"):
	        frame = pd.read_excel(path)
	    else:
	        try:
	            frame = pd.read_csv(path)
	        except Exception:
	            # Keep the original fallback: anything that does not parse as
	            # CSV is retried as an Excel workbook.
	            frame = pd.read_excel(path)
	    column = frame[frame.columns[0]].dropna()
	    return [str(value) for value in column]


	def _extract_in_batches(extraction_chain, base_inputs, keywords, batch_size):
	    """Run ``extraction_chain`` over ``keywords`` in slices of ``batch_size`` and concatenate the results."""
	    results = []
	    for start in range(0, len(keywords), batch_size):
	        batch = keywords[start:start + batch_size]
	        try:
	            result = extraction_chain.run({**base_inputs, 'keywords': ','.join(batch)})
	        except Exception as E:
	            # Best effort: a failing batch is reported and skipped so the
	            # remaining batches are still processed.
	            print('this batch did not worked from {} to {}'.format(start, start + batch_size))
	            print(E)
	            result = []
	        results += result
	    return results


	def run_chain(input_prompt, keywords_file, categories_file=None, batch_size=50):
	    """Categorize the keywords of an uploaded file and save the result as CSV.

	    Args:
	        input_prompt: Instruction text substituted for ``{prompt_input}``.
	        keywords_file: Uploaded CSV/Excel file (object with ``.name`` or a
	            path); only its first column is read.
	        categories_file: Optional uploaded CSV/Excel file whose first column
	            lists the allowed categories. When ``None`` the prompt variant
	            without categories is used.
	        batch_size: Number of keywords sent per request when categories are
	            given. Ignored when ``categories_file`` is ``None``.

	    Returns:
	        A tuple ``(results, 'themes_results.csv')`` where ``results`` is the
	        concatenation of whatever each chain call returned (presumably a
	        list of keyword/category dicts - confirm against the chain output).

	    Raises:
	        ValueError: If categories are given and ``batch_size`` is below 1
	            (previously this looped forever).
	    """
	    keywords = _read_first_column(keywords_file)
	    inputs = {'prompt_input': input_prompt}

	    if categories_file is not None:
	        batch_size = int(batch_size)
	        if batch_size < 1:
	            raise ValueError("batch_size must be a positive integer")
	        inputs['categories'] = ','.join(_read_first_column(categories_file))
	        active_chain = chain
	    else:
	        # Without categories all keywords go out in a single request, as in
	        # the original code. NOTE(review): presumably so the model derives
	        # one consistent set of categories - confirm.
	        batch_size = max(len(keywords), 1)
	        active_chain = chain_no_cat

	    results = _extract_in_batches(active_chain, inputs, keywords, batch_size)
	    # Written once after all batches, so the returned path always exists,
	    # even when the keyword list is empty.
	    results_to_csv(results)
	    return results, 'themes_results.csv'

	def results_to_csv(results, path='themes_results.csv'):
	    """Write extraction results to a CSV file, one row per result dict.

	    Args:
	        results: Iterable of dicts (e.g. ``{'keyword': ..., 'category': ...}``).
	        path: Destination file; defaults to ``'themes_results.csv'``.

	    Rows are kept aligned even when a dict lacks a key: the missing cell is
	    left empty. Regrouping into one list per key, as done previously, gave
	    columns of different lengths in that case and pandas refused to build
	    the frame.
	    """
	    rows = list(results)
	    # Columns appear in the order their keys are first seen across all rows.
	    columns = list(dict.fromkeys(key for row in rows for key in row))
	    pd.DataFrame(rows, columns=columns).to_csv(path, index=False)


	# Gradio UI: an editable instruction prompt, two upload widgets, two run
	# buttons (with / without categories) and the text + file outputs.
	with gr.Blocks() as demo:
	    # NOTE(review): the default prompt ends mid-sentence
	    # ("If the categories are not given ") - confirm the intended wording.
	    prompt_input = gr.Text("""I need your help to analyze and categorize the provided list of keywords
	into the appropriate categories.
	The goal is to understand information demand on search engines within this industry. Each keyword represents a search and it should have a relation with the category.
	Extract each keyword and assign the best category among the given categories. Return every keyword with the relative category in pairs.
	If the categories are not given """)
	    gr.Markdown("Upload CSV or xlsx with keywords: Just a csv with all the keywords in one column. Should have a header")
	    # Extensions need the leading dot, matching the categories uploader.
	    keywords_file = gr.File(file_types=['.csv', '.xlsx'], label='keywords')
	    gr.Markdown("Upload CSV or xlsx with categories: Just a csv with all the categories in one column. Should have a header")
	    categories_file = gr.File(file_types=['.csv', '.xlsx'], label='categories')
	    btn = gr.Button(value="Run with categories")
	    btn2 = gr.Button(value="Run without categories")
	    txt_3 = gr.Textbox(value="", label="Output")
	    output_file = gr.File(label="Output File",
	                          file_count="single",
	                          file_types=["", ".", ".csv", ".xls", ".xlsx"])

	    # Both buttons call run_chain; the second omits the categories file so
	    # run_chain falls back to its categories_file=None default.
	    btn.click(run_chain, inputs=[prompt_input, keywords_file, categories_file], outputs=[txt_3, output_file])
	    btn2.click(run_chain, inputs=[prompt_input, keywords_file], outputs=[txt_3, output_file])

	demo.launch()