Spaces:

ChenyuRabbitLove
/

junyi_bot_external

Runtime error

App Files Files Community

Chenyu committed on Jul 14, 2023

Commit

f807e7d

1 Parent(s): 175c5c3

Add prod app

Browse files

Files changed (10) hide show

app.py +129 -4
final_result.json +0 -0
requirements.txt +5 -0
utils/__init__.py +0 -0
utils/__pycache__/__init__.cpython-39.pyc +0 -0
utils/__pycache__/gpt_processor.cpython-39.pyc +0 -0
utils/__pycache__/pdf_processor.cpython-39.pyc +0 -0
utils/docx_processor.py +40 -0
utils/pdf_processor.py +51 -0
utils/work_flow_controller.py +31 -0

app.py CHANGED Viewed

@@ -1,7 +1,132 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()

+import json
+import time
+import random
 import gradio as gr
+import pandas as pd
+from utils.gpt_processor import QuestionAnswerer
+qa_processor = QuestionAnswerer()  # shared helper; bot() calls its answer_question(context, question)
+current_file = None  # file name the user last selected; rebound by bot() via `global`
+context = None  # full text of that file; stays None until bot() has seen a known file name
+with open("final_result.json", 'r', encoding='UTF-8') as fp:  # document index, loaded once at import time
+    db = json.load(fp)  # bot() treats the keys as file names and reads 'keywords', 'summarized_content', 'file_full_content'
+def read_examples():
+    df = pd.read_csv(r'examples.csv')
+    return [f"我想了解有關於「{keyword}」的文件" for keyword in df['word'].tolist()]
+def user(message, history):
+    #return gr.update(value="", interactive=False), history + [[message, None]]
+    return "", history + [[message, None]]
+def bot(history):
+    user_message = history[-1][0]
+    global current_file
+    global context
+    #check if user input has "我想了解"
+    if "我想了解" in user_message:
+        # get keyword from "「」"
+        keyword = user_message.split("「")[1].split("」")[0]
+        # check if keyword is in db
+        file_list = []
+        for key in db.keys():
+            if keyword in db[key]['keywords']:
+                file_list.append(key)
+        if len(file_list) == 0:
+            response = [
+                [user_message, "Sorry, I can't find any documents about this topic. Please try again."],
+            ]
+        else:
+            bot_message = "以下是我所找到的文件："
+            for file in file_list:
+                bot_message += "\n" + file
+            bot_message += "\n\n" + "請複製貼上想要了解的文件，我會給你該文件的摘要"
+            response = [
+                [user_message, bot_message],
+            ]
+        history = response
+        # history[-1][1] = ""
+        # for character in bot_message:
+        #     history[-1][1] += character
+        #     time.sleep(random.uniform(0.01, 0.05))
+        #     yield history
+        return history
+    # check if user input has a pdf file name
+    if ".pdf" in user_message or ".docx" in user_message:
+        current_file = user_message
+        context = db[current_file]['file_full_content']
+        # check if file name is in db
+        if user_message in db.keys():
+            bot_message = f"文件 {user_message} 的摘要如下："
+            bot_message += "\n\n" + db[user_message]['summarized_content']
+            bot_message += "\n\n" + "可以透過詢問來了解更多這個文件的內容"
+            response = [
+                [user_message, bot_message],
+            ]
+        else:
+            response = [
+                [user_message, "Sorry, I can't find this file. Please try again."],
+            ]
+        history[-1] = response[0]
+        # history[-1][1] = ""
+        # for character in bot_message:
+        #     history[-1][1] += character
+        #     time.sleep(random.uniform(0.01, 0.05))
+        #     yield history
+        return history
+    if context is None:
+        response = [
+            [user_message, "請輸入一個文件名稱或是點選下方的範例"],
+        ]
+        history[-1] = response[0]
+        return history
+    if context is not None:
+        bot_message = qa_processor.answer_question(context, user_message)
+        response = [
+            [user_message, bot_message],
+        ]
+        history[-1] = response[0]
+        return history
+with gr.Blocks() as demo:  # chat UI: title row, chat panel with input row, example prompts
+    history = gr.State([])  # not referenced by any handler in this block
+    user_question = gr.State("")  # not referenced by any handler in this block
+    with gr.Row():
+        gr.HTML('Junyi Academy Chatbot')
+        #status_display = gr.Markdown("Success", elem_id="status_display")
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=5):
+            with gr.Row():
+                chatbot = gr.Chatbot()  # conversation display; its value is the history handed to user() and bot()
+            with gr.Row():
+                with gr.Column(scale=12):
+                    user_input = gr.Textbox(
+                        show_label=False,
+                        placeholder="Enter text",
+                        container=False,
+                    )
+                # with gr.Column(min_width=70, scale=1):
+                #     submit_btn = gr.Button("Send")
+                with gr.Column(min_width=70, scale=1):
+                    clear_btn = gr.Button("Clear")
+                response = user_input.submit(user,  # on submit: user() clears the textbox and appends the pending turn
+                                  [user_input, chatbot],
+                                  [user_input, chatbot],
+                                  queue=False,
+                                  ).then(bot, chatbot, chatbot)  # then bot() fills in the reply of that turn
+                response.then(lambda: gr.update(interactive=True), None, [user_input], queue=False)  # finally mark the textbox interactive
+                clear_btn.click(lambda: None, None, chatbot, queue=False)  # Clear: set the chatbot value to None
+    examples = gr.Examples(examples=read_examples(),  # example prompts from read_examples(), wired to user_input
+                           inputs=[user_input])
+if __name__ == "__main__":  # only launch when executed as a script
+    demo.launch(share=True)

final_result.json ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt CHANGED Viewed

	@@ -0,0 +1,5 @@

+openai
+tiktoken
+opencc
+docx2txt
+PyPDF2

utils/__init__.py ADDED Viewed

File without changes

utils/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (170 Bytes). View file

utils/__pycache__/gpt_processor.cpython-39.pyc ADDED Viewed

Binary file (8.69 kB). View file

utils/__pycache__/pdf_processor.cpython-39.pyc ADDED Viewed

Binary file (809 Bytes). View file

utils/docx_processor.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import unicodedata
+import re
+import logging
+import docx2txt
+from gpt_processor import Translator
+class DOCXProcessor:
+    def __init__(self, file_path: str) -> None:
+        self.file_path = file_path
+        self.file_info = {
+            'file_name': self.file_path.split('/')[-1],
+            'file_format': 'DOCX',
+            'file_full_content': '',
+        }
+        self.__build_info()
+    def __build_info(self) -> None:
+        try:
+            text = docx2txt.process(self.file_path)
+            text = unicodedata.normalize("NFKD", text)
+            text = text.replace('\n', ' ').replace('\r', '')
+            text = re.sub(' +', ' ', text)
+            self.file_info['is_chinese'] = self.__is_chinese(text)
+            tranlator = Translator()
+            self.file_info['file_full_content'] = tranlator.translate_to_chinese(text) if not self.file_info['is_chinese'] else text
+        except FileNotFoundError:
+            print(f"File not found: {self.file_path}")
+        except Exception as e:
+            print(f"An error occurred: {str(e)}")
+    def __is_chinese(self, text: str) -> bool:
+        for char in text:
+            if char >= '\u4e00' and char <= '\u9fff':
+                return True
+        return False

utils/pdf_processor.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import PyPDF2
+import unicodedata
+import re
+import logging
+from datamodel.data_model import PDFRawData
+from .gpt_processor import Translator
+class PDFProcessor:
+    def __init__(self, file_path: str) -> None:
+        self.file_path = file_path
+        self.file_info = {
+            'file_name': self.file_path.split('/')[-1],
+            'file_format': 'PDF',
+            'total_pages': 0,
+            'file_content': {},
+            'file_full_content': '',
+        }
+        self.__build_info()
+    def __build_info(self) -> None:
+        try:
+            with open(self.file_path, 'rb') as pdf_file:
+                pdf_reader = PyPDF2.PdfReader(pdf_file)
+                pages = len(pdf_reader.pages)
+                self.file_info['total_pages'] = pages
+                for i, page in enumerate(pdf_reader.pages):
+                    text = page.extract_text()
+                    text = unicodedata.normalize("NFKD", text)
+                    text = text.replace('\n', ' ').replace('\r', '')
+                    text = re.sub(' +', ' ', text)
+                    self.file_info['is_chinese'] = self.__is_chinese(text)
+                    temp = {}
+                    logging.info(f"Processing page {i + 1}...")
+                    temp['page_num'] = i + 1
+                    tranlator = Translator()
+                    temp['page_content'] = tranlator.translate_to_chinese(text) if not self.file_info['is_chinese'] else text
+                    self.file_info['file_content'][i + 1] = temp
+                    self.file_info['file_full_content'] = self.file_info['file_full_content'] + temp['page_content']
+        except FileNotFoundError:
+            print(f"File not found: {self.file_path}")
+        except Exception as e:
+            print(f"An error occurred: {str(e)}")
+    def __is_chinese(self, text: str) -> bool:
+        for char in text:
+            if char >= '\u4e00' and char <= '\u9fff':
+                return True
+        return False

utils/work_flow_controller.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import json
+from .pdf_processor import PDFProcessor
+from .gpt_processor import Translator, EmbeddingGenerator, KeywordsGenerator, TopicsGenerator
+processors = {  # file extension -> processor class; WorkFlowController looks the extension up here
+    'pdf': PDFProcessor,
+}
+class WorkFlowController():
+    def __init__(self, file_path: str, file_name: str) -> None:
+        # get file raw content
+        self.file_name = file_name
+        file_format = file_path.split('.')[-1]
+        self.file_processor = processors[file_format]
+        self.file_info = self.file_processor(file_path).file_info
+    def process_file(self):
+        # process file content
+        # return processed data
+        if not self.file_info['is_chinese']:
+            translator = Translator()
+            self.file_info[1]['file_content'] = translator.translate_to_chinese(self.file_info[1]['file_content'])
+    # save file_info data to json file
+    def dump_to_json(self) -> None:
+        with open(f'{self.file_name}.json', 'w', encoding='utf-8') as f:
+            json.dump(self.file_info, f, indent=4, ensure_ascii=False)