Spaces:
Runtime error
Runtime error
Commit
·
7cfd43a
1
Parent(s):
c1ead4a
fix wrong prompt format
Browse files
app.py
CHANGED
|
@@ -17,6 +17,7 @@ load_dotenv()
|
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
logger.setLevel(logging.DEBUG)
|
| 19 |
|
|
|
|
| 20 |
|
| 21 |
def plot_wordcloud( text):
|
| 22 |
"""
|
|
@@ -71,7 +72,7 @@ def do( business_id, business_name, address):
|
|
| 71 |
|
| 72 |
crawled_results = pd.DataFrame(crawled_results)
|
| 73 |
# logger.debug(crawled_results)
|
| 74 |
-
extracted_results = extract_results( crawled_results)
|
| 75 |
# logger.error(extracted_results['extracted_results'].columns)
|
| 76 |
extracted_results = extracted_results['extracted_results'][ [ 'business_id', 'business_name', 'address', 'category', 'evidence', 'phone_number', 'description', 'store_name'] ]
|
| 77 |
|
|
|
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
logger.setLevel(logging.DEBUG)
|
| 19 |
|
| 20 |
+
classes = list([ x for x in category2supercategory.keys() if len(x)>0])
|
| 21 |
|
| 22 |
def plot_wordcloud( text):
|
| 23 |
"""
|
|
|
|
| 72 |
|
| 73 |
crawled_results = pd.DataFrame(crawled_results)
|
| 74 |
# logger.debug(crawled_results)
|
| 75 |
+
extracted_results = extract_results( crawled_results, classes=classes)
|
| 76 |
# logger.error(extracted_results['extracted_results'].columns)
|
| 77 |
extracted_results = extracted_results['extracted_results'][ [ 'business_id', 'business_name', 'address', 'category', 'evidence', 'phone_number', 'description', 'store_name'] ]
|
| 78 |
|
sheet.py
CHANGED
|
@@ -99,18 +99,22 @@ def compose_analysis( client, query, search_results, classes: list, model: str =
|
|
| 99 |
Return
|
| 100 |
response: str
|
| 101 |
"""
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
"role": "system",
|
| 106 |
-
"content": f'''
|
| 107 |
As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
|
| 108 |
-
your task is to first identify relevant information of the identical store based on store name and proxmity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be
|
| 109 |
It's very important to omit unrelated results. Do not make up any assumption.
|
| 110 |
Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
|
| 111 |
If no relevant information has been found, simply output json with empty values.
|
| 112 |
I'll tip you and guarantee a place in heaven you do a great job completely according to my instruction.
|
| 113 |
'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
},
|
| 115 |
{
|
| 116 |
"role": "user",
|
|
@@ -150,7 +154,7 @@ def test_compose_analysis():
|
|
| 150 |
def compose_classication(
|
| 151 |
client,
|
| 152 |
evidence,
|
| 153 |
-
classes: list = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'],
|
| 154 |
backup_classes: list = [ '中式', '西式'],
|
| 155 |
model: str = 'gpt-3.5-turbo-0125'
|
| 156 |
) -> str:
|
|
@@ -382,12 +386,13 @@ def extract_results( data: pd.DataFrame, classes: list ):
|
|
| 382 |
business_id = d[2]
|
| 383 |
business_name = d[3]
|
| 384 |
address = d[6]
|
|
|
|
| 385 |
query = compose_query( address, business_name)
|
| 386 |
try:
|
| 387 |
ana_res = compose_analysis( client, query = query, search_results = evidence, classes = classes)
|
| 388 |
ana_res = json.loads(ana_res)
|
| 389 |
except Exception as e:
|
| 390 |
-
print(f"# ANALYSIS error {e}: i = {i},
|
| 391 |
empty_indices.append(i)
|
| 392 |
continue
|
| 393 |
|
|
@@ -630,8 +635,8 @@ category2supercategory = {
|
|
| 630 |
"西餐廳(含美式,義式,墨式)": "西式",
|
| 631 |
"中式": "中式",
|
| 632 |
"西式": "西式",
|
| 633 |
-
"
|
| 634 |
-
"
|
| 635 |
"早餐": ""
|
| 636 |
}
|
| 637 |
|
|
@@ -647,7 +652,7 @@ supercategory2category = {
|
|
| 647 |
"燒烤",
|
| 648 |
"韓式料理(含火鍋,烤肉)"
|
| 649 |
],
|
| 650 |
-
"西式": ["西餐廳(含美式,義式,墨式)", "
|
| 651 |
"": ["早餐"]
|
| 652 |
}
|
| 653 |
|
|
@@ -671,7 +676,7 @@ if __name__=='__main__':
|
|
| 671 |
parser.add_argument("--combined_file_path", type=str, default="data/gpt3.5/combined_results.joblib")
|
| 672 |
parser.add_argument("--postprocessed_results", type=str, default="data/gpt3.5/postprocessed_results.joblib")
|
| 673 |
parser.add_argument("--formatted_results", type=str, default="data/gpt3.5/formatted_results.csv")
|
| 674 |
-
parser.add_argument("--classes", type=list, default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'])
|
| 675 |
parser.add_argument("--backup_classes", type=list, default=['中式', '西式'])
|
| 676 |
parser.add_argument("--strategy", type=str, default='replace', choices=['replace', 'patch'])
|
| 677 |
parser.add_argument("--n_processes", type=int, default=4)
|
|
|
|
| 99 |
Return
|
| 100 |
response: str
|
| 101 |
"""
|
| 102 |
+
categories = ", ".join([ "`"+x+"`" for x in classes if x!='早餐' ])+ " or " + "`早餐`"
|
| 103 |
+
# print(f"categoreis: {categories}")
|
| 104 |
+
system_prompt = '''
|
|
|
|
|
|
|
| 105 |
As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
|
| 106 |
+
your task is to first identify relevant information of the identical store based on store name and proxmity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be `小吃店`, `日式料理(含居酒屋,串燒)`, `火(鍋/爐)`, `東南亞料理(不含日韓)`, `海鮮熱炒`, `特色餐廳(含雞、鵝、牛、羊肉)`, `傳統餐廳`, `燒烤`, `韓式料理(含火鍋,烤肉)`, `西餐廳(含美式,義式,墨式)`, `西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)`, `西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)` or `早餐`.
|
| 107 |
It's very important to omit unrelated results. Do not make up any assumption.
|
| 108 |
Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
|
| 109 |
If no relevant information has been found, simply output json with empty values.
|
| 110 |
I'll tip you and guarantee a place in heaven you do a great job completely according to my instruction.
|
| 111 |
'''
|
| 112 |
+
# print(f"system prompt = {system_prompt}")
|
| 113 |
+
chat_completion = client.chat.completions.create(
|
| 114 |
+
messages=[
|
| 115 |
+
{
|
| 116 |
+
"role": "system",
|
| 117 |
+
"content": system_prompt
|
| 118 |
},
|
| 119 |
{
|
| 120 |
"role": "user",
|
|
|
|
| 154 |
def compose_classication(
|
| 155 |
client,
|
| 156 |
evidence,
|
| 157 |
+
classes: list = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)', ],
|
| 158 |
backup_classes: list = [ '中式', '西式'],
|
| 159 |
model: str = 'gpt-3.5-turbo-0125'
|
| 160 |
) -> str:
|
|
|
|
| 386 |
business_id = d[2]
|
| 387 |
business_name = d[3]
|
| 388 |
address = d[6]
|
| 389 |
+
ana_res = None
|
| 390 |
query = compose_query( address, business_name)
|
| 391 |
try:
|
| 392 |
ana_res = compose_analysis( client, query = query, search_results = evidence, classes = classes)
|
| 393 |
ana_res = json.loads(ana_res)
|
| 394 |
except Exception as e:
|
| 395 |
+
print(f"# ANALYSIS error {e}: i = {i}, ana_res = {ana_res}")
|
| 396 |
empty_indices.append(i)
|
| 397 |
continue
|
| 398 |
|
|
|
|
| 635 |
"西餐廳(含美式,義式,墨式)": "西式",
|
| 636 |
"中式": "中式",
|
| 637 |
"西式": "西式",
|
| 638 |
+
"西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)": "西式",
|
| 639 |
+
"西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)": "西式",
|
| 640 |
"早餐": ""
|
| 641 |
}
|
| 642 |
|
|
|
|
| 652 |
"燒烤",
|
| 653 |
"韓式料理(含火鍋,烤肉)"
|
| 654 |
],
|
| 655 |
+
"西式": ["西餐廳(含美式,義式,墨式)", "西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)", "西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)"],
|
| 656 |
"": ["早餐"]
|
| 657 |
}
|
| 658 |
|
|
|
|
| 676 |
parser.add_argument("--combined_file_path", type=str, default="data/gpt3.5/combined_results.joblib")
|
| 677 |
parser.add_argument("--postprocessed_results", type=str, default="data/gpt3.5/postprocessed_results.joblib")
|
| 678 |
parser.add_argument("--formatted_results", type=str, default="data/gpt3.5/formatted_results.csv")
|
| 679 |
+
parser.add_argument("--classes", type=list, default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)', '西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)', '西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)', '早餐'])
|
| 680 |
parser.add_argument("--backup_classes", type=list, default=['中式', '西式'])
|
| 681 |
parser.add_argument("--strategy", type=str, default='replace', choices=['replace', 'patch'])
|
| 682 |
parser.add_argument("--n_processes", type=int, default=4)
|