Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import pandas as pd | |
| import pathlib | |
| import whoosh | |
| import whoosh.index | |
| import whoosh.query | |
| import os | |
| from datetime import date as Date | |
| import re | |
# All data files live in the "Data" directory that sits next to this script.
DATA_FOLDER = pathlib.Path(__file__).parent.joinpath("Data")
# Transcription CSV files, looked up by name as "<index>-<number>.csv".
RAW_FOLDER = DATA_FOLDER.joinpath("Transcription_raw")
# Whoosh index directories; only entries whose name starts with "sub" are opened.
INDEX_FOLDER = DATA_FOLDER.joinpath("Transcription_index")
class Searcher:
    """Search the transcription indexes and load per-video transcriptions.

    Attributes:
        ix: A ``MultiIndexSearcher`` over every sub-index in ``INDEX_FOLDER``.
        df_video_links: The table read from ``video_links.csv``; its ``date``
            and ``title`` columns are used to describe each hit.
    """

    def __init__(self):
        self.ix = self.make_total_ix()
        self.df_video_links = self.get_video_links()

    def make_total_ix(self):
        """Open every ``sub*`` index directory and wrap them in one searcher.

        Returns:
            A ``MultiIndexSearcher`` holding the opened Whoosh indexes.
        """
        sub_indexes = [
            whoosh.index.open_dir(INDEX_FOLDER / name)
            for name in os.listdir(INDEX_FOLDER)
            if name.startswith("sub")
        ]
        return MultiIndexSearcher(sub_indexes)

    def search(self, date_start, date_end, **kwargs):
        """Return the videos that match a query within a date range.

        Args:
            date_start: First broadcast date to include (inclusive).
            date_end: Last broadcast date to include (inclusive).
            **kwargs: Forwarded unchanged to ``self.ix.search`` (for example
                ``q=<whoosh query>``).

        Returns:
            A list of ``(index, date, title)`` tuples ordered by broadcast
            date, where ``date`` is the original string from the table.
        """
        dated_rows = []
        for hit_title in self.ix.search(**kwargs):
            # The text before the first "m" of a hit title is the row
            # position of the video in the links table.
            index = int(hit_title.split("m")[0])
            row = self.df_video_links.iloc[index]
            date = row["date"]
            # Dates are stored as "year/month/day" strings.
            broadcast_day = Date(*map(int, date.split("/")))
            if date_start <= broadcast_day <= date_end:
                dated_rows.append((broadcast_day, index, date, row["title"]))

        # The parsed date leads each tuple so a plain sort orders by date;
        # it is dropped again from the returned rows.
        dated_rows.sort()
        return [(index, date, title) for _, index, date, title in dated_rows]

    def get_video_links(self):
        """Read ``video_links.csv``, using its first column as the row index."""
        return pd.read_csv(DATA_FOLDER / "video_links.csv", index_col=0)

    def get_content(self, index):
        """Load the transcription CSV with the highest number for a video.

        Files in ``RAW_FOLDER`` are named ``<index>-<number>.csv``; the one
        with the numerically largest ``<number>`` is read.

        Args:
            index: Video index as used in the first element of the tuples
                returned by ``search``.

        Returns:
            The transcription as a ``pandas.DataFrame``.

        Raises:
            FileNotFoundError: If no file for ``index`` exists.
        """
        best = self._latest_raw_file(os.listdir(RAW_FOLDER), index)
        return pd.read_csv(RAW_FOLDER / best)

    @staticmethod
    def _latest_raw_file(names, index):
        """Pick the ``<index>-<number>.csv`` name with the largest number."""
        # fullmatch plus an escaped dot rejects look-alikes such as
        # "5-10.csv.bak" or "5-10xcsv".
        pattern = re.compile(r"{}-(\d+)\.csv".format(re.escape(str(index))))
        matches = [found for found in map(pattern.fullmatch, names) if found]
        if not matches:
            raise FileNotFoundError(
                "no transcription file found for index {}".format(index)
            )
        # Compare the suffix as an integer: a plain string sort would rank
        # "5-9.csv" above "5-10.csv".
        newest = max(matches, key=lambda found: int(found.group(1)))
        return newest.group(0)
class MultiIndexSearcher:
    """Run one query against several indexes and merge the hit titles."""

    def __init__(self, ixes):
        # Index objects; each must provide ``searcher()`` returning a
        # context manager that has a ``search`` method.
        self.ixes = ixes

    def search(self, **kwargs):
        """Collect the ``title`` field of every hit from every index.

        Args:
            **kwargs: Passed through to each index searcher's ``search``
                call, together with ``limit=None``.

        Returns:
            A list of titles, grouped in the order of ``self.ixes``.
        """
        collected = []
        for index in self.ixes:
            with index.searcher() as reader:
                # limit=None asks for all hits rather than a capped page.
                results = reader.search(limit=None, **kwargs)
                # Titles are copied out while the searcher is still open.
                collected.extend(hit["title"] for hit in results)
        return collected
# Shared Searcher used by main(); constructing it opens the indexes and reads
# the video link table.
# NOTE(review): Streamlit re-executes the script on each interaction, so this
# is presumably rebuilt on every rerun; consider st.cache_resource - confirm.
searcher = Searcher()


def main():
    """Render the KATO DB page: search form, result table and transcription."""
    st.title("KATO DB")

    keyword = st.text_input(
        "検索したいキーワードを入力して、Enterを押してください\n"
        "空欄だと全文書表示します。\n"
        "検索にヒットしない場合書き起こしAIに認識されていない可能性があります。(「もこう」など)"
    )
    date_start = st.date_input(
        "検索したい開始日付を入力してください",
        Date(2009, 1, 1),
    )
    date_end = st.date_input(
        "検索したい終了日付を入力してください",
        Date(2050, 12, 31),
    )

    # Build the query. A blank or whitespace-only keyword lists every
    # document; otherwise every whitespace-separated word must match (AND).
    words = keyword.split()
    if words:
        query = whoosh.query.And(
            [whoosh.query.Term("content", word) for word in words]
        )
    else:
        query = whoosh.query.Every()

    contents = searcher.search(q=query, date_start=date_start, date_end=date_end)
    st.write("該当件数:{}件".format(len(contents)))

    results = pd.DataFrame(contents, columns=["管理番号", "放送日", "動画タイトル"])
    st.dataframe(results, hide_index=True)

    selected_index = st.selectbox(
        "管理番号を選択して、Enterを押して書き起こしを表示します",
        results["管理番号"],
    )
    # Nothing is selected when there is nothing to choose from.
    if selected_index is not None:
        df_transcription = searcher.get_content(selected_index)
        st.dataframe(df_transcription, width=1000)

    # Footer: dataset, source code, prototype and blog links.
    st.write("ダウンロードはこちらから:https://github.com/konbraphat51/kato_db_dataset")
    st.write("データ収集のプログラムコード・仕組みの説明はこちらから:https://github.com/konbraphat51/kato_db")
    st.write("加藤AIのプロトタイプ:https://colab.research.google.com/drive/1QsJN50wvLEJx04P4XaBtsKqx1q3532OU?usp=sharing")
    st.write("ブログ(協力者募集しています):https://qiita.com/konbraphat51/items/5b27afda442c13806c25")
    st.write("データ最終更新:2023/8/19")


if __name__ == "__main__":
    main()