Spaces:
Runtime error
Runtime error
Commit
·
c1ead4a
1
Parent(s):
6023585
fix prompt mistakes
Browse files
sheet.py
CHANGED
|
@@ -90,7 +90,7 @@ def test_get_condensed_result():
|
|
| 90 |
res = get_serp(query)
|
| 91 |
cond_res = get_condensed_result(res)
|
| 92 |
|
| 93 |
-
def compose_analysis( client, query, search_results, model: str = 'gpt-3.5-turbo-0125'):
|
| 94 |
"""
|
| 95 |
Argument
|
| 96 |
query: str
|
|
@@ -103,9 +103,9 @@ def compose_analysis( client, query, search_results, model: str = 'gpt-3.5-turbo
|
|
| 103 |
messages=[
|
| 104 |
{
|
| 105 |
"role": "system",
|
| 106 |
-
"content": '''
|
| 107 |
As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
|
| 108 |
-
your task is to first identify relevant information of the identical store based on store name and proximity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be
|
| 109 |
It's very important to omit unrelated results. Do not make up any assumption.
|
| 110 |
Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
|
| 111 |
If no relevant information has been found, simply output json with empty values.
|
|
@@ -366,7 +366,7 @@ def crawl_results_mp( data: pd.DataFrame, crawl_file_path: str, n_processes: int
|
|
| 366 |
print( f"total time: {time.time() - st}")
|
| 367 |
return crawled_results
|
| 368 |
|
| 369 |
-
def extract_results( data: pd.DataFrame ):
|
| 370 |
"""
|
| 371 |
Argument
|
| 372 |
data: `evidence`, `result`
|
|
@@ -384,7 +384,7 @@ def extract_results( data: pd.DataFrame ):
|
|
| 384 |
address = d[6]
|
| 385 |
query = compose_query( address, business_name)
|
| 386 |
try:
|
| 387 |
-
ana_res = compose_analysis( client, query = query, search_results = evidence)
|
| 388 |
ana_res = json.loads(ana_res)
|
| 389 |
except Exception as e:
|
| 390 |
print(f"# ANALYSIS error {e}: i = {i}, evidence = {evidence}")
|
|
@@ -405,7 +405,7 @@ def extract_results( data: pd.DataFrame ):
|
|
| 405 |
"empty_indices": empty_indices
|
| 406 |
}
|
| 407 |
|
| 408 |
-
def extract_results_mp( crawled_results, extracted_file_path):
|
| 409 |
"""
|
| 410 |
Argument
|
| 411 |
Return
|
|
@@ -417,7 +417,7 @@ def extract_results_mp( crawled_results, extracted_file_path):
|
|
| 417 |
if not os.path.exists(extracted_file_path):
|
| 418 |
split_data = split_dataframe( crawled_results)
|
| 419 |
with mp.Pool(args.n_processes) as pool:
|
| 420 |
-
extracted_results = pool.
|
| 421 |
extracted_results = merge_results( extracted_results, dataframe_columns=['extracted_results'], list_columns=['empty_indices'])
|
| 422 |
with open( extracted_file_path, "wb") as f:
|
| 423 |
joblib.dump( extracted_results, f)
|
|
@@ -630,7 +630,7 @@ category2supercategory = {
|
|
| 630 |
"西餐廳(含美式,義式,墨式)": "西式",
|
| 631 |
"中式": "中式",
|
| 632 |
"西式": "西式",
|
| 633 |
-
"
|
| 634 |
"西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)": "西式",
|
| 635 |
"早餐": ""
|
| 636 |
}
|
|
|
|
| 90 |
res = get_serp(query)
|
| 91 |
cond_res = get_condensed_result(res)
|
| 92 |
|
| 93 |
+
def compose_analysis( client, query, search_results, classes: list, model: str = 'gpt-3.5-turbo-0125'):
|
| 94 |
"""
|
| 95 |
Argument
|
| 96 |
query: str
|
|
|
|
| 103 |
messages=[
|
| 104 |
{
|
| 105 |
"role": "system",
|
| 106 |
+
"content": f'''
|
| 107 |
As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
|
| 108 |
+
your task is to first identify relevant information of the identical store based on store name and proximity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be {",".join("`"+x+"`" for x in classes)}.
|
| 109 |
It's very important to omit unrelated results. Do not make up any assumption.
|
| 110 |
Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
|
| 111 |
If no relevant information has been found, simply output json with empty values.
|
|
|
|
| 366 |
print( f"total time: {time.time() - st}")
|
| 367 |
return crawled_results
|
| 368 |
|
| 369 |
+
def extract_results( data: pd.DataFrame, classes: list ):
|
| 370 |
"""
|
| 371 |
Argument
|
| 372 |
data: `evidence`, `result`
|
|
|
|
| 384 |
address = d[6]
|
| 385 |
query = compose_query( address, business_name)
|
| 386 |
try:
|
| 387 |
+
ana_res = compose_analysis( client, query = query, search_results = evidence, classes = classes)
|
| 388 |
ana_res = json.loads(ana_res)
|
| 389 |
except Exception as e:
|
| 390 |
print(f"# ANALYSIS error {e}: i = {i}, evidence = {evidence}")
|
|
|
|
| 405 |
"empty_indices": empty_indices
|
| 406 |
}
|
| 407 |
|
| 408 |
+
def extract_results_mp( crawled_results, extracted_file_path, classes: list):
|
| 409 |
"""
|
| 410 |
Argument
|
| 411 |
Return
|
|
|
|
| 417 |
if not os.path.exists(extracted_file_path):
|
| 418 |
split_data = split_dataframe( crawled_results)
|
| 419 |
with mp.Pool(args.n_processes) as pool:
|
| 420 |
+
extracted_results = pool.starmap( extract_results, [ (x, classes) for x in split_data])
|
| 421 |
extracted_results = merge_results( extracted_results, dataframe_columns=['extracted_results'], list_columns=['empty_indices'])
|
| 422 |
with open( extracted_file_path, "wb") as f:
|
| 423 |
joblib.dump( extracted_results, f)
|
|
|
|
| 630 |
"西餐廳(含美式,義式,墨式)": "西式",
|
| 631 |
"中式": "中式",
|
| 632 |
"西式": "西式",
|
| 633 |
+
"西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)": "西式",
|
| 634 |
"西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)": "西式",
|
| 635 |
"早餐": ""
|
| 636 |
}
|