Spaces:

linpershey
/

sheetbot

Runtime error

App Files Files Community

linpershey committed on Apr 28, 2024

Commit

4925baf

1 Parent(s): d8e228f

remove from lfs and add back

Browse files

Files changed (2) hide show

app.py +112 -3
sheet.py +671 -3

app.py CHANGED Viewed

@@ -1,3 +1,112 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1361c1e04f9071fadb5137915387ace77713c1d919db55da04e338b4d69eee35
-size 3816

+import os
+import logging
+import gradio as gr
+import pandas as pd
+from dotenv import load_dotenv
+import jieba
+jieba.cut('你好')
+from wordcloud import WordCloud
+from PIL import Image
+import matplotlib.pyplot as plt
+from sheet import compose_query, get_serp, get_condensed_result, extract_results, postprocess_result, format_output, category2supercategory
+# Load environment variables (e.g. FONT_PATH, read by plot_wordcloud) from a local .env file.
+load_dotenv()
+# Module-level logger, set to DEBUG.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+def plot_wordcloud( text):
+    """
+    """
+    if os.getenv("FONT_PATH", None) is not None:
+        wc_generator = WordCloud(font_path=os.getenv("FONT_PATH"))
+    else:
+        wc_generator = WordCloud()
+    img = wc_generator.generate( " ".join(jieba.cut(text)))
+    # fig, ax = plt.subplots()
+    # ax.imshow(wordcloud, interpolation='bilinear')
+    # ax.axis("off")
+    return img.to_image()
+def format_category( formatted_results):
+    """
+    """
+    return "\n\n".join([
+        f"> 大類別：{formatted_results['supercategory'].values[0]}",
+        f"> 小類別：{formatted_results['category'].values[0]}",
+        f"> 商家名稱：{formatted_results['store_name'].values[0]}",
+        f"> 電話：{formatted_results['phone_number'].values[0]}",
+        f"> 描述：{formatted_results['description'].values[0]}"
+    ])
+def do( business_id, business_name, address):
+    """
+    """
+    crawled_results = []
+    google_domain = "google.com.tw"
+    gl = 'tw'
+    lr  = 'lang_zh-TW'
+    query = compose_query(address, business_name)
+    try:
+        res = get_serp( query, google_domain, gl, lr)
+    except Exception as e:
+        return f"Error: {e}"
+    cond_res = get_condensed_result(res)
+    crawled_results.append( {
+        "index": 0,
+        "business_id": business_id,
+        "business_name": business_name,
+        "serp": res,
+        "evidence": cond_res,
+        "address": address
+    } )
+    crawled_results = pd.DataFrame(crawled_results)
+    # logger.debug(crawled_results)
+    extracted_results = extract_results( crawled_results)
+    # logger.error(extracted_results['extracted_results'].columns)
+    extracted_results = extracted_results['extracted_results'][ [ 'business_id', 'business_name', 'address', 'category', 'evidence', 'phone_number', 'description', 'store_name'] ]
+    postprocessed_results = postprocess_result( extracted_results, postprocessed_results_path="/tmp/postprocessed_results.joblib", category_hierarchy=category2supercategory)
+    os.remove("/tmp/postprocessed_results.joblib")
+    formatted_results = format_output( postprocessed_results)
+    # logger.error( formatted_results.columns)
+    formatted_output = format_category( formatted_results)
+    img = plot_wordcloud(formatted_results['formatted_evidence'].values[0])
+    return formatted_results['formatted_evidence'].values[0], img, formatted_output
+## --- interface --- ##
+# outputs = [gr.Dataframe(row_count = (1, "dynamic"), col_count=(6,"dynamic"), label="output data", interactive=1)]
+# demo = gr.Interface(
+#         fn=do,
+#         inputs=[ "text", "text", "text"],
+#         outputs=outputs,
+#     )
+## --- block --- ##
+# Blocks layout: a title, one row of three text inputs, one row of three outputs,
+# and a submit button that feeds the inputs to `do`.
+with gr.Blocks() as demo:
+    gr.Markdown("🌟 自動分類餐廳型態 🌟")
+    with gr.Row():
+        # Order matches the parameters of `do`: business_id, business_name, address.
+        inputs = [ gr.Textbox( label="統一編號", placeholder="統一編號"), gr.Textbox(placeholder="商家名稱"), gr.Textbox(placeholder="地址")]
+    with gr.Row():
+        # outputs = [gr.Dataframe(row_count = (1, "dynamic"), col_count=(6,"dynamic"), label="output data", interactive=1)]
+        # Order matches the tuple `do` returns on success: evidence text, word cloud image, category summary.
+        outputs = [ gr.Markdown( label="參考資料（google search）"), gr.Image( label="文字雲"), gr.Markdown( label="類別", )]
+    btn = gr.Button("Submit")
+    btn.click(fn=do, inputs=inputs, outputs=outputs)
+if __name__ == "__main__":
+    demo.launch(share=True, auth=("kota", "kota"))

sheet.py CHANGED Viewed

@@ -1,3 +1,671 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:01d56594e5b1014193942ccac5bed55f04a0927aece2617172fabff1794745ad
-size 30077

+import os
+import time
+import json
+import joblib
+import math
+import itertools
+import argparse
+import multiprocessing as mp
+import pandas as pd
+from dotenv import load_dotenv
+from serpapi import GoogleSearch
+import tiktoken
+from openai import OpenAI
+from tqdm import tqdm
+# Read configuration from a local .env file.
+load_dotenv()
+# OpenAI organization id, passed to the OpenAI clients created in this module.
+ORGANIZATION_ID = os.getenv('OPENAI_ORGANIZATION_ID')
+# SerpApi key used by get_serp.
+SERP_API_KEY = os.getenv('SERP_APIKEY')
+def get_leads( file_path: str, names: list = ['營業地址', '統一編號', '總機構統一編號', '營業人名稱', '資本額', '設立日期', '組織別名稱', '使用統一發票',
+       '行業代號', '名稱', '行業代號1', '名稱1', '行業代號2', '名稱2', '行業代號3', '名稱3']):
+    """
+    """
+    assert os.path.exists(file_path)
+    data = pd.read_csv( file_path, names=names)
+    return data
+def get_serp( query: str, google_domain: str, gl: str, lr: str) -> dict:
+    """
+    """
+    results = []
+    search = GoogleSearch({
+        "q": query,
+        'google_domain': google_domain,
+        'gl': gl,
+        'lr': lr,
+        "api_key": SERP_API_KEY
+      })
+    result = search.get_dict()
+    # print(result['organic_results'][0])
+    # return result['organic_results'][0]
+    return result
+def test_get_serp():
+    # query = "原味商行"
+    # query = "南投縣中寮鄉中寮村鄉林巷４３號 和興商店"
+    # query = "啓輝環管企業社"
+    # query = "蘭陽客棧小吃店"
+    # query = '韓笑味食品有限公司'
+    # query = '小阿姨的店'
+    query = '達米娜魚料理店'
+    res = get_serp(query, google_domain='google.com.tw')
+    print(res)
+def get_condensed_result(result):
+    """
+    Argument
+        result
+    Return
+        condensed_result:
+    Example:
+        result['knowledge_graph'].keys() # 'title', 'thumbnail', 'type', 'entity_type', 'kgmid', 'knowledge_graph_search_link', 'serpapi_knowledge_graph_search_link', 'tabs', 'place_id', 'directions', 'local_map', 'rating', 'review_count', '服務項目', '地址', '地址_links', 'raw_hours', 'hours', '電話號碼', '電話號碼_links', 'popular_times', 'user_reviews', 'reviews_from_the_web', 'unclaimed_listing', '個人資料', '其他人也搜尋了以下項目', '其他人也搜尋了以下項目_link', '其他人也搜尋了以下項目_stick'
+    """
+    filtered_results = [
+        {"title": r.get('title',""), 'snippet': r.get('snippet',"")} for r in result['organic_results']
+    ]
+    if 'knowledge_graph' in result:
+        if 'user_reviews' in result['knowledge_graph']:
+            filtered_results.append( {'title': result['knowledge_graph']['title'], '顧客評價': "\t".join([ _.get('summary', '') for _ in result['knowledge_graph']['user_reviews']]) })
+        if '其他人也搜尋了以下項目' in result['knowledge_graph']:
+            filtered_results.append( {'title': "類似的店", 'snippet': "\t".join([ str(_.get('extensions', '')) for _ in result['knowledge_graph']['其他人也搜尋了以下項目']]) })
+        if '暫停營業' in result['knowledge_graph']:
+            filtered_results.append( {'status': '暫停營業' if result['knowledge_graph']['暫停營業'] else '營業中'})
+        if '電話號碼' in result['knowledge_graph']:
+            filtered_results.append( {'telephone_number': result['knowledge_graph']['電話號碼']})
+    condensed_result = json.dumps(filtered_results, ensure_ascii=False)
+    # print( condensed_results )
+    return condensed_result
+def test_get_condensed_result():
+    # query = "原味商行"
+    # query = "南投縣中寮鄉中寮村鄉林巷４３號 和興商店"
+    # query = "啓輝環管企業社"
+    # query = "蘭陽客棧小吃店"
+    # query = '韓笑味食品有限公司'
+    # query = '小阿姨的店'
+    query = '達米娜魚料理店'
+    res = get_serp(query)
+    cond_res = get_condensed_result(res)
+def compose_analysis( client, query, search_results):
+    """
+    Argument
+        query: str
+        search_results: str
+    Return
+        response: str
+    """
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {
+                "role": "system",
+                "content": '''
+                    As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
+                    your task is to first identify relevant information of the identical store based on store name and proxmity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be `小吃店`, `日式料理(含居酒屋，串燒)`, `火(鍋／爐)`, `東南亞料理(不含日韓)`, `海鮮熱炒`,  `特色餐廳(含雞、鵝、牛、羊肉)`, `傳統餐廳`, `燒烤`, `韓式料理(含火鍋，烤肉)` or `西餐廳(含美式，義式，��式)`.
+                    It's very important to omit unrelated results. Do not make up any assumption.
+                    Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
+                    If no relevant information has been found, simply output json with empty values.
+                    I'll tip you and guarantee a place in heaven you do a great job completely according to my instruction.
+                '''
+            },
+            {
+                "role": "user",
+                "content": f'''
+                    `query`: `{query}`,
+                    `search_results`: {search_results}
+                ''',
+            }
+        ],
+        model = "gpt-4-0125-preview",
+        response_format = {"type": "json_object"},
+        temperature = 0,
+        # stream = True
+    )
+#     response = []
+#     for chunk in chat_completion:
+#         text = chunk.choices[0].delta.content or ""
+#         response.append(text)
+#         print( text, end="")
+#     return "".join(response)
+    response = chat_completion.choices[0].message.content
+    return response
+def test_compose_analysis():
+    # query = "原味商行"
+    # query = "南投縣中寮鄉中寮村鄉林巷４３號 和興商店"
+    # query = "啓輝環管企業社"
+    # query = "蘭陽客棧小吃店"
+    # query = '韓笑味食品有限公司'
+    # query = '小阿姨的店'
+    query = '達米娜魚料理店'
+    res = get_serp(query)
+    cond_res = get_condensed_result(res)
+    resp = compose_analysis( client, query = query, search_results = cond_res)
+    print( resp )
+def compose_classication(
+        client,
+        evidence,
+        classes: list = ['小吃店', '日式料理(含居酒屋，串燒)', '火(鍋／爐)', '東南亞料理(不含日韓)', '海鮮熱炒',  '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋，烤肉)', '西餐廳(含美式，義式，墨式)'],
+        backup_classes: list = [ '中式', '西式'],
+    ) -> str:
+    """
+    Argument
+        client:
+        evidence: str
+        classes: list
+    Return
+        response: str
+    """
+    if isinstance(classes, list):
+        classes = ", ".join([ f"`{x}`" for x in classes])
+    elif isinstance(classes, str):
+        pass
+    else:
+        raise Exception(f"Incorrect classes type: {type(classes)}")
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {
+                "role": "system",
+                "content": f'''
+                    As a helpful and rigorous retail analyst, given the provided information about a store,
+                    your task is two-fold. First, classify provided evidence below into the mostly relevant category from the following: {classes}.
+                    Second, if no relevant information has been found, classify the evidence into the mostly relevant supercategory from the following: {backup_classes}.
+                    It's very important to omit unrelated piece of evidence and don't make up any assumption.
+                    Please think step by step, and output in json format. An example output json is like {{"category": "..."}}
+                    If no relevant piece of information can ever be found at all, simply output json with empty string "".
+                    I'll tip you and guarantee a place in heaven you do a great job completely according to my instruction.
+                '''
+            },
+            {
+                "role": "user",
+                "content": f'''
+                    `evidence`: `{evidence}`
+                ''',
+            }
+        ],
+        model = "gpt-4-0125-preview",
+        response_format = {"type": "json_object"},
+        temperature = 0,
+        # stream = True
+    )
+    response = chat_completion.choices[0].message.content
+    return response
+def test_compose_classification( evidence):
+    """
+    """
+    evidence = '[{"title": "年年有魚餐飲有限公司- 店家介紹", "snippet": "統一編號. 93769370 · 公司狀況. 營業中 · 公司名稱. 年年有魚餐飲有限公司 · 公司類型. 有限公司 · 資本總額. 6000000 · 所在地. 臺中市西區民龍里臺灣大道2段159號1樓."}, {"title": "年年有魚餐飲有限公司", "snippet": "營業地址, 臺中市西區民龍里臺灣大道2段159號1樓 ; 統編, 93769370 ; 營業名稱, 年年有魚餐飲有限公司 ; 資本額, 6,000,000 ; 設立日期, 1120713."}, {"title": "年年有魚餐飲有限公司", "snippet": "公司名稱, 年年有魚餐飲有限公司 ; 資本總額(元), 6,000,000 ; 負責人, 江敏 ; 登記地址, 看地圖 臺中市西區民龍里臺灣大道二段159號1樓 郵遞區號查詢 ; 設立 ..."}, {"title": "年年有魚餐飲有限公司", "snippet": "年年有魚餐飲有限公司 ; 負責人, 江敏 ; 登記地址, 台中市西區民龍��台灣大道二段159號1樓 ; 公司狀態, 核准設立 ; 資本額, 6,000,000元 ; 所在縣市, 台中市 西區 民龍里."}, {"title": "江_敏－年年有魚餐飲有限公司", "snippet": "負責人:江_敏·公司名:年年有魚餐飲有限公司·統一編號:93769370·公司地址:臺中市西區民龍里臺灣大道二段159號1樓·資本額:6000000·公司狀況:核准設立·核准設立 ..."}, {"title": "年年有魚餐飲有限公司/負責人：江_敏", "snippet": "公司名稱:年年有魚餐飲有限公司·代表人姓名:江_敏·公司所在地:臺中市西區民龍里臺灣大道二段159號1樓·統編:93769370資本總額:6000000·公司狀況:核准設立·核准設立 ..."}, {"title": "貓吃魚餐飲有限公司｜工作徵才簡介", "snippet": "貓吃魚餐飲有限公司. 台中市西屯區. 時薪186元. 應徵人數：1 ~ 5人. 排休; 晚班; 工作經驗不拘; 學歷不拘. 1.佈置及清理餐桌2.為顧客帶位或安排座位3.上菜並提供有關用餐的 ..."}, {"title": "食力餐飲_食力國際有限公司｜公司簡介", "snippet": "「食力國際有限公司」正式成立於2023年4月，目前短短時間已成立了四個品牌～ 一、【食力據點】 1:食力咖哩- 台中遠百店（台中市西屯區臺灣大道三段251號大遠百12樓大食 ..."}, {"title": "112 年臺中市優質餐飲店家分級評核獲獎名單", "snippet": "112 年臺中市優質餐飲店家分級評核獲獎名單-. 臺中市餐廳飲食店低碳認證書20 家. 1 築間幸福鍋物-臺中市政二店臺中市西屯區文心路二段213 號. 2 有之和牛-臺中文心店."}, {"title": "年年有魚水族館", "snippet": "營業地址, 臺中市西屯區何安里西屯路2段101-2號1樓 ; 統編, 21833774 ; 營業名稱, 年年有魚水族館 ; 資本額, 60,000 ; 設立日期, 0940502."}, {"title": "類似的店", "snippet": "[\'設計公司\']\\t[\'餐廳\']"}, {"telephone_number": "04 2376 6318"}]'
+    x = compose_classication( evidence )
+    print( x )
+def classify_results(
+        analysis_results: pd.DataFrame,
+        input_column: str = 'evidence',
+        output_column: str = 'classified_category',
+        classes: list = ['小吃店', '日式料理(含居酒屋，串燒)', '火(鍋／爐)', '東南亞料理(不含日韓)', '海鮮熱炒',  '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋，烤肉)', '西餐廳(含美式，義式，墨式)'],
+        backup_classes: list = [ '中式', '西式']
+    ):
+    """
+    Argument
+        analysis_results: dataframe
+        input_column: str
+        output_column: str
+        classes: list
+    Return
+        analysis_results: dataframe
+    """
+    client = OpenAI( organization = ORGANIZATION_ID)
+    classified_results = analysis_results.copy()
+    empty_indices = []
+    labels = []
+    for idx, evidence in zip( analysis_results['index'], analysis_results[input_column]):
+        try:
+            label = json.loads(compose_classication( client, evidence, classes=classes, backup_classes=backup_classes))['category']
+            labels.append(label)
+        except Exception as e:
+            print(f"# CLASSIFICATION error -> evidence: {e}")
+            labels.append("")
+            empty_indices.append(idx)
+    classified_results[output_column] = labels
+    return {
+        "classified_results": classified_results,
+        "empty_indices": empty_indices
+    }
+def classify_results_mp( extracted_results: pd.DataFrame, classified_file_path, classes, backup_classes, n_processes: int = 4):
+    """
+    Argument
+        extracted_results:
+        classified_file_path:
+        classes: ['小吃店', '日式料理(含居酒屋，串燒)', '火(鍋／爐)', '東南亞料理(不含日韓)', '海鮮熱炒',  '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋，烤肉)', '西餐廳(含美式，義式，墨式)']
+        backup_classes: [ '中式', '西式']
+        n_processes: int
+    Return
+        classified_results: dataframe
+    Reference
+        200 records, 4 processes, 122.4695s
+    """
+    st = time.time()
+    # classified_file_path = "data/classified_result.joblib"
+    if not os.path.exists(classified_file_path):
+        split_data = split_dataframe(extracted_results)
+        with mp.Pool(args.n_processes) as pool:
+            classified_results = pool.starmap(
+                classify_results,
+                [ (
+                    d,
+                    'evidence',
+                    'classified_category',
+                    classes,
+                    backup_classes
+                ) for d in split_data]
+            )
+            classified_results = merge_results( classified_results, dataframe_columns=['classified_results'], list_columns=['empty_indices'])
+            with open( classified_file_path, "wb") as f:
+                joblib.dump( classified_results, f)
+    else:
+        with open( classified_file_path, "rb") as f:
+            classified_results = joblib.load(f)
+    print( f"total time: {time.time() - st}")
+    return classified_results
+def test_get_evidence_classification():
+    analysis_results = classify_results( analysis_results)
+    patch_analysis_results = classify_results( patch_analysis_results)
+def compose_query( address, name, with_index: bool = True):
+    """
+    Argumemnt
+        # d: series with d[1]: 地址, d[4]: 營業人名稱 #
+        address: str
+        name: str
+        with_index: bool
+    Return
+        query: `縣市` `營業人名稱`
+    """
+    # if with_index:  # .itertuples()
+    #     query = f"{d[1][:3]} {d[4]}"
+    # else:
+    #     query = f"{d[0][:3]} {d[3]}"
+    query = f"{address[:3]} {name}"
+    return query
+def crawl_results( data: pd.DataFrame, google_domain: str = 'google.com.tw', gl: str = 'tw', lr: str = 'lang_zh-TW'):
+    """
+    Argument
+        data: dataframe
+        google_domain: str
+        gl: str
+        lr: str
+    Return
+        crawled_results
+    Reference
+        200 records, 4 processes, 171.36490321159363
+    """
+    serp_results = []
+    condensed_results = []
+    crawled_results = []
+    empty_indices = []
+    for i, d in tqdm(enumerate(data.itertuples())):
+        idx = d[0]
+        address = d[1]
+        business_id = d[2]
+        business_name = d[4]
+        query = compose_query(address, business_name)
+        try:
+            res = get_serp( query, google_domain, gl, lr)
+            serp_results.append(res)
+        except:
+            print( f"# SERP error: i = {i}, idx = {idx}, query = {query}")
+            empty_indices.append(i)
+            continue
+        try:
+            cond_res = get_condensed_result(res)
+            condensed_results.append(cond_res)
+        except:
+            print(f"# CONDENSE error: i = {i}, idx = {idx}, res = {res}")
+            empty_indices.append(i)
+            continue
+        crawled_results.append( {
+            "index": idx,
+            "business_id": business_id,
+            "business_name": business_name,
+            "serp": res,
+            "evidence": cond_res,
+            "address": address,
+        } )
+    crawled_results = pd.DataFrame(crawled_results)
+    return {
+        "crawled_results": crawled_results,
+        "empty_indices": empty_indices
+    }
+def crawl_results_mp( data: pd.DataFrame, crawl_file_path: str, n_processes: int = 4):
+    st = time.time()
+    # crawl_file_path = "data/crawled_results.joblib"
+    if not os.path.exists(crawl_file_path):
+        split_data = split_dataframe( data )
+        with mp.Pool(n_processes) as pool:
+            crawled_results = pool.map( crawl_results, split_data)
+            crawled_results = merge_results( crawled_results, dataframe_columns=['crawled_results'], list_columns=['empty_indices'])
+            with open( crawl_file_path, "wb") as f:
+                joblib.dump( crawled_results, f)
+    else:
+        with open( crawl_file_path, "rb") as f:
+            crawled_results = joblib.load(f)
+    print( f"total time: {time.time() - st}")
+    return crawled_results
+def extract_results( data: pd.DataFrame ):
+    """
+    Argument
+        data: `evidence`, `result`
+    Return
+        extracted_results: dataframe of `extracted_evidence`
+    """
+    client = OpenAI( organization = ORGANIZATION_ID)
+    extracted_results = []
+    empty_indices = []
+    for i, d in tqdm(enumerate(data.itertuples())):
+        idx = d[1]
+        evidence = d.evidence
+        business_id = d[2]
+        business_name = d[3]
+        address = d[6]
+        query = compose_query( address, business_name)
+        try:
+            ana_res = compose_analysis( client, query = query, search_results = evidence)
+            ana_res = json.loads(ana_res)
+        except Exception as e:
+            print(f"# ANALYSIS error {e}: i = {i}, evidence = {evidence}")
+            empty_indices.append(i)
+            continue
+        extracted_results.append( {
+            "index": idx,
+            "business_id": business_id,
+            "business_name": business_name,
+            "evidence": evidence,
+            ** ana_res
+        } )
+    extracted_results = pd.DataFrame(extracted_results)
+    return {
+        "extracted_results": extracted_results,
+        "empty_indices": empty_indices
+    }
+def extract_results_mp( crawled_results, extracted_file_path):
+    """
+    Argument
+    Return
+    Reference
+        200 records, 4 processes, 502.26914715766907
+    """
+    st = time.time()
+    # args.extracted_file_path = "data/extracted_results.joblib"
+    if not os.path.exists(extracted_file_path):
+        split_data = split_dataframe( crawled_results)
+        with mp.Pool(args.n_processes) as pool:
+            extracted_results = pool.map( extract_results, split_data)
+            extracted_results = merge_results( extracted_results, dataframe_columns=['extracted_results'], list_columns=['empty_indices'])
+            with open( extracted_file_path, "wb") as f:
+                joblib.dump( extracted_results, f)
+    else:
+        with open( extracted_file_path, "rb") as f:
+            extracted_results = joblib.load(f)
+    print( f"total time: {time.time() - st}")
+    return extracted_results
+def test_get_analysis_results():
+    data = pd.read_csv("data/餐廳類型分類.xlsx - 測試清單.csv")
+    analysis_results, empty_indices = extract_results( data )
+def postprocess_result( results: pd.DataFrame, postprocessed_results_path, category_hierarchy: dict, column_name: str = 'category'):
+    """
+    Argument
+        analysis_result: `evidence`, `result`
+        postprocessed_results_path
+    Return
+    """
+    # index = analysis_result['result']['index']
+    # store_name = data.loc[index]['營業人名稱'] if len(analysis_result['result'].get('store_name',''))==0 else analysis_result['result']['store_name']
+    # address = data.loc[index]['營業地址'] if len(analysis_result['result'].get('address',''))==0 else analysis_result['result']['address']
+    # post_res = {
+    #     "evidence": analysis_result['evidence'],
+    #     "index": index,
+    #     "begin_date": data.loc[index]['設立日期'],
+    #     "store_name": store_name,
+    #     "address": address,
+    #     "description": analysis_result['result'].get('description', ""),
+    #     "phone_number": analysis_result['result'].get('phone_number', ""),
+    #     "category": analysis_result['result'].get('category', ""),
+    #     "supercategory": category_hierarchy.get(analysis_result['result'].get('category', ""), analysis_result['result'].get('category',"")),
+    # }
+    if not os.path.exists(postprocessed_results_path):
+        postprocessed_results = results.copy()
+        postprocessed_results['supercategory'] = postprocessed_results[column_name].apply(lambda x: category_hierarchy.get(x, ''))
+        with open( postprocessed_results_path, "wb") as f:
+            joblib.dump( postprocessed_results, f)
+    else:
+        with open( postprocessed_results_path, "rb") as f:
+            postprocessed_results = joblib.load(f)
+    return postprocessed_results
+def test_postprocess_result():
+    analysis_result = ""
+    pos_res = postprocess_result( analysis_result)
+def combine_results( results: pd.DataFrame, combined_results_path: str, src_column: str = 'classified_category', tgt_column: str = 'category', strategy: str = 'replace'):
+    """
+    Argument
+        classified_results_df: dataframe
+        combined_results_path
+        src_column: str
+        strategy: str, 'replace' or 'patch'
+    Return
+        combined_results: dataframe
+    """
+    if not os.path.exists(combined_results_path):
+        combined_results = results.copy()
+        if strategy == 'replace':
+            condition = (combined_results[tgt_column]=='') | (combined_results[src_column]!=combined_results[tgt_column])
+            combined_results.loc[ condition, tgt_column] = combined_results[condition][src_column].values
+        elif strategy == 'patch':
+            condition = (combined_results[tgt_column]=='')
+            combined_results.loc[ condition, tgt_column] = combined_results[condition][src_column].values
+        else:
+            raise Exception(f"Strategy {strategy} not implemented")
+        with open( combined_results_path, "wb") as f:
+            joblib.dump( combined_results, f)
+    else:
+        with open( combined_results_path, "rb") as f:
+            combined_results = joblib.load(f)
+    return combined_results
+def format_evidence(evidence):
+    """
+    """
+    formatted = []
+    evidence = json.loads(evidence)
+    # print( len(evidence) )
+    for i in range(len(evidence)):
+        if 'title' in evidence[i] and '顧客評價' in evidence[i]:
+            f = f"\n> 顧客評價： {evidence[i]['顧客評價']}"
+        elif 'title' in evidence[i] and evidence[i]['title']=='類似的店':
+            f = f"\n> 類似的店： {evidence[i]['snippet']}"
+        elif 'status' in evidence[i]:
+            f = f"\n> 經營狀態： {evidence[i]['status']}"
+        elif 'telephone_number' in evidence[i]:
+            f = f"\n> 電話號碼： {evidence[i]['telephone_number']}"
+        else:
+            try:
+                f = f"{i+1}. {evidence[i]['title']} ({evidence[i].get('snippet','')})"
+            except KeyError:
+                print( evidence[i] )
+                raise KeyError
+        formatted.append(f)
+    return "\n".join(formatted)
+def format_output( df: pd.DataFrame, input_column: str = 'evidence', output_column: str = 'formatted_evidence', format_func = format_evidence):
+    """
+    Argument
+        df: `evidence`, `result`
+        input_column:
+        output_column:
+        format_func:
+    Return
+        formatted_df: dataframe of `formatted_evidence`
+    """
+    formatted_df = df.copy()
+    formatted_df[output_column] = formatted_df[input_column].apply(format_evidence)
+    return formatted_df
+def merge_results( results: list, dataframe_columns: list, list_columns: list):
+    """
+    Argument
+        results: a list of dataframes
+        dataframe_columns: list
+        list_columns: list
+    """
+    assert len(results) > 0, "No results to merge"
+    merged_results = {}
+    for result in results:
+        for key in dataframe_columns:
+            mer_res = pd.concat([ r[key] for r in results], ignore_index=True)
+            merged_results[key] = mer_res
+        for key in list_columns:
+            mer_res = list(itertools.chain(*[ r[key] for r in results]))
+            merged_results[key] = mer_res
+    return merged_results
+def split_dataframe( df: pd.DataFrame, n_processes: int = 4) -> list:
+    """
+    """
+    n = df.shape[0]
+    n_per_process = math.ceil(n / n_processes)
+    return [ df.iloc[i:i+n_per_process] for i in range(0, n, n_per_process)]
+def main(args):
+    """
+    Argument
+        args: argparse
+    """
+    ## 讀取資料名單 ##
+    data = get_leads(args.data_path)
+    ## 進行爬蟲與分析 ##
+    # crawled_results = crawl_results(data)
+    crawled_results = crawl_results_mp( data, args.crawled_file_path, n_processes=args.n_processes)
+    ## 方法 1: 擷取關鍵資訊與分類 ##
+    # extracted_results = extract_results(
+    #     crawled_results['crawled_results']
+    # )
+    extracted_results = extract_results_mp(
+        crawled_results = crawled_results['crawled_results'],
+        extracted_file_path = args.extracted_file_path
+    )
+    ## 方法2: 直接對爬蟲結果分類 ##
+    # classified_results = classify_results(
+    #     extracted_results['extracted_results'],
+    #     input_column = 'evidence',
+    #     output_column = 'classified_category',
+    #     classes = ['中式', '西式'],
+    #     backup_classes = [ '中式', '西式']
+    # )
+    classified_results = classify_results_mp(
+        extracted_results['extracted_results'],
+        args.classified_file_path,
+        classes=args.classes,
+        backup_classes=args.backup_classes,
+        n_processes=args.n_processes
+    )
+    ## 合併分析結果 ##
+    combined_results = combine_results(
+        classified_results['classified_results'],
+        args.combined_file_path,
+        src_column='classified_category',
+        tgt_column='category',
+        strategy='replace'
+    )
+    ## 後處理分析結果 ##
+    postprossed_results = postprocess_result(
+        combined_results,
+        args.postprocessed_results,
+        category2supercategory
+    )
+    formatted_results = format_output( postprossed_results, input_column = 'evidence', output_column = 'formatted_evidence', format_func = format_evidence)
+    formatted_results.to_csv("data/formatted_results.csv", index=False)
+# Maps every category, and the two supercategories themselves, to its
+# supercategory ("中式" or "西式").
+category2supercategory = {
+        "小吃店": "中式",
+        "日式料理(含居酒屋，串燒)": "中式",
+        "火(鍋／爐)": "中式",
+        "東南亞料理(不含日韓)": "中式",
+        "海鮮熱炒": "中式",
+        "特色餐廳(含雞、鵝、牛、羊肉)": "中式",
+        "傳統餐廳": "中式",
+        "燒烤": "中式",
+        "韓式料理(含火鍋，烤肉)": "中式",
+        "西餐廳(含美式，義式，墨式)": "西式",
+        "中式": "中式",
+        "西式": "西式"
+    }
+# Inverse lookup: supercategory -> list of the categories that belong to it.
+supercategory2category = {
+        "中式": [
+            "小吃店",
+            "日式料理(含居酒屋，串燒)",
+            "火(鍋／爐)",
+            "東南亞料理(不含日韓)",
+            "海鮮熱炒",
+            "特色餐廳(含雞、鵝、牛、羊肉)",
+            "傳統餐廳",
+            "燒烤",
+            "韓式料理(含火鍋，烤肉)"
+        ],
+        "西式": ["西餐廳(含美式，義式，墨式)"]
+    }
+if __name__=='__main__':
+    base = "https://serpapi.com/search.json"
+    engine = 'google'
+    # query = "Coffee"
+    google_domain = 'google.com.tw'
+    gl = 'tw'
+    lr = 'lang_zh-TW'
+    # url = f"{base}?engine={engine}&q={query}&google_domain={google_domain}&gl={gl}&lr={lr}"
+    n_processes = 4
+    client = OpenAI( organization = ORGANIZATION_ID)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_path", type=str, default="data/餐廳類型分類.xlsx - 測試清單.csv")
+    parser.add_argument("--classified_file_path", type=str, default="data/classified_results.joblib")
+    parser.add_argument("--extracted_file_path", type=str, default="data/extracted_results.joblib")
+    parser.add_argument("--crawled_file_path", type=str, default="data/crawled_results.joblib")
+    parser.add_argument("--combined_file_path", type=str, default="data/combined_results.joblib")
+    parser.add_argument("--postprocessed_results", type=str, default="data/postprocessed_results.joblib")
+    parser.add_argument("--classes", type=list, default=['小吃店', '日式料理(含居酒屋，串燒)', '火(鍋／爐)', '東南亞料理(不含日韓)', '海鮮熱炒',  '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋，烤肉)', '西餐廳(含美式，義式，墨式)'])
+    parser.add_argument("--backup_classes", type=list, default=['中式', '西式'])
+    parser.add_argument("--n_processes", type=int, default=4)
+    args = parser.parse_args()
+    main(args)