Spaces:
Runtime error
Runtime error
ray committed on
Commit ·
9021b39
1
Parent(s): 61ec090
v2 - manually split knowledge units
Browse files- .gitignore +4 -0
- app.py +13 -6
- chatbot.py +8 -8
- custom_io.py +45 -0
- scripts/convert_docx_to_md.sh +37 -0
.gitignore
CHANGED
|
@@ -1,3 +1,7 @@
|
|
| 1 |
.env
|
| 2 |
**/__pycache__
|
| 3 |
awesumcare_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
.env
|
| 2 |
**/__pycache__
|
| 3 |
awesumcare_data
|
| 4 |
+
TestData
|
| 5 |
+
logs
|
| 6 |
+
wandb
|
| 7 |
+
streamlit_chatbot_pack
|
app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import openai
|
| 3 |
import os
|
|
@@ -14,7 +15,7 @@ from llama_index.ingestion import IngestionPipeline
|
|
| 14 |
from chat_template import CHAT_TEXT_QA_PROMPT
|
| 15 |
from schemas import ChatbotVersion, ServiceProvider
|
| 16 |
from chatbot import Chatbot, IndexBuilder
|
| 17 |
-
from custom_io import UnstructuredReader, default_file_metadata_func
|
| 18 |
from qdrant import client as qdrantClient
|
| 19 |
from llama_index import set_global_service_context
|
| 20 |
|
|
@@ -28,11 +29,11 @@ llama_index.set_global_handler("arize_phoenix")
|
|
| 28 |
openai.api_key = os.getenv("OPENAI_API_KEY")
|
| 29 |
|
| 30 |
IS_LOAD_FROM_VECTOR_STORE = True
|
| 31 |
-
VDB_COLLECTION_NAME = "demo-
|
| 32 |
MODEL_NAME = ChatbotVersion.CHATGPT_4.value
|
| 33 |
|
| 34 |
|
| 35 |
-
CHUNK_SIZE =
|
| 36 |
LLM, EMBED_MODEL = get_service_provider_config(
|
| 37 |
service_provider=ServiceProvider.OPENAI, model_name=MODEL_NAME)
|
| 38 |
service_context = ServiceContext.from_defaults(
|
|
@@ -45,13 +46,19 @@ set_global_service_context(service_context)
|
|
| 45 |
|
| 46 |
class AwesumIndexBuilder(IndexBuilder):
|
| 47 |
def _load_doucments(self):
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
".pdf": UnstructuredReader(),
|
| 50 |
".docx": UnstructuredReader(),
|
| 51 |
".pptx": UnstructuredReader(),
|
|
|
|
| 52 |
},
|
| 53 |
recursive=True,
|
| 54 |
-
|
|
|
|
| 55 |
file_metadata=default_file_metadata_func)
|
| 56 |
|
| 57 |
self.documents = dir_reader.load_data()
|
|
@@ -73,7 +80,7 @@ class AwesumIndexBuilder(IndexBuilder):
|
|
| 73 |
return
|
| 74 |
pipeline = IngestionPipeline(
|
| 75 |
transformations=[
|
| 76 |
-
SentenceSplitter(),
|
| 77 |
self.embed_model,
|
| 78 |
],
|
| 79 |
vector_store=self.vector_store,
|
|
|
|
| 1 |
+
import glob
|
| 2 |
import gradio as gr
|
| 3 |
import openai
|
| 4 |
import os
|
|
|
|
| 15 |
from chat_template import CHAT_TEXT_QA_PROMPT
|
| 16 |
from schemas import ChatbotVersion, ServiceProvider
|
| 17 |
from chatbot import Chatbot, IndexBuilder
|
| 18 |
+
from custom_io import MarkdownReader, UnstructuredReader, default_file_metadata_func
|
| 19 |
from qdrant import client as qdrantClient
|
| 20 |
from llama_index import set_global_service_context
|
| 21 |
|
|
|
|
| 29 |
openai.api_key = os.getenv("OPENAI_API_KEY")
|
| 30 |
|
| 31 |
IS_LOAD_FROM_VECTOR_STORE = True
|
| 32 |
+
VDB_COLLECTION_NAME = "demo-v1"
|
| 33 |
MODEL_NAME = ChatbotVersion.CHATGPT_4.value
|
| 34 |
|
| 35 |
|
| 36 |
+
CHUNK_SIZE = 8191
|
| 37 |
LLM, EMBED_MODEL = get_service_provider_config(
|
| 38 |
service_provider=ServiceProvider.OPENAI, model_name=MODEL_NAME)
|
| 39 |
service_context = ServiceContext.from_defaults(
|
|
|
|
| 46 |
|
| 47 |
class AwesumIndexBuilder(IndexBuilder):
|
| 48 |
def _load_doucments(self):
|
| 49 |
+
directory = "./awesumcare_data/awesumcare_manual_data"
|
| 50 |
+
# all_files = glob.glob(os.path.join(directory, '*.md'))
|
| 51 |
+
# faq_files = [f for f in all_files if 'FAQ' in os.path.basename(f)]
|
| 52 |
+
# print(faq_files)
|
| 53 |
+
dir_reader = SimpleDirectoryReader(directory, file_extractor={
|
| 54 |
".pdf": UnstructuredReader(),
|
| 55 |
".docx": UnstructuredReader(),
|
| 56 |
".pptx": UnstructuredReader(),
|
| 57 |
+
".md": MarkdownReader()
|
| 58 |
},
|
| 59 |
recursive=True,
|
| 60 |
+
# input_files=faq_files,
|
| 61 |
+
exclude=["*.png", "*.pptx", "*.docx", "*.pdf"],
|
| 62 |
file_metadata=default_file_metadata_func)
|
| 63 |
|
| 64 |
self.documents = dir_reader.load_data()
|
|
|
|
| 80 |
return
|
| 81 |
pipeline = IngestionPipeline(
|
| 82 |
transformations=[
|
| 83 |
+
# SentenceSplitter(),
|
| 84 |
self.embed_model,
|
| 85 |
],
|
| 86 |
vector_store=self.vector_store,
|
chatbot.py
CHANGED
|
@@ -126,14 +126,14 @@ class Chatbot:
|
|
| 126 |
partial_message += token
|
| 127 |
yield partial_message
|
| 128 |
|
| 129 |
-
urls = [source.node.metadata.get(
|
| 130 |
-
|
| 131 |
-
if urls:
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
|
| 138 |
def convert_to_chat_messages(self, history: List[List[str]]) -> List[ChatMessage]:
|
| 139 |
chat_messages = [ChatMessage(
|
|
|
|
| 126 |
partial_message += token
|
| 127 |
yield partial_message
|
| 128 |
|
| 129 |
+
# urls = [source.node.metadata.get(
|
| 130 |
+
# "file_name") for source in response.source_nodes if source.score >= 0.78 and source.node.metadata.get("file_name")]
|
| 131 |
+
# if urls:
|
| 132 |
+
# urls = list(set(urls))
|
| 133 |
+
# url_section = "\n \n\n---\n\n參考: \n" + \
|
| 134 |
+
# "\n".join(f"- {url}" for url in urls)
|
| 135 |
+
# partial_message += url_section
|
| 136 |
+
# yield partial_message
|
| 137 |
|
| 138 |
def convert_to_chat_messages(self, history: List[List[str]]) -> List[ChatMessage]:
|
| 139 |
chat_messages = [ChatMessage(
|
custom_io.py
CHANGED
|
@@ -50,6 +50,51 @@ class UnstructuredReader(BaseReader):
|
|
| 50 |
]
|
| 51 |
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
def default_file_metadata_func(file_path: str) -> Dict:
|
| 54 |
"""Get some handy metadate from filesystem.
|
| 55 |
|
|
|
|
| 50 |
]
|
| 51 |
|
| 52 |
|
| 53 |
+
class MarkdownReader(BaseReader):
    """Reader that turns a Markdown file into knowledge-unit documents.

    The file is split by ``parse_knowledge_units``; each resulting unit
    becomes the text of one ``Document``.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        split_documents: Optional[bool] = True,
    ) -> List[Document]:
        """Parse ``file`` into documents.

        Args:
            file: Path of the Markdown file to read.
            extra_info: Metadata attached to every returned document; an
                empty dict is used when omitted.
            split_documents: When truthy, return one document per knowledge
                unit. Otherwise return a single document holding all units
                joined by blank lines.

        Returns:
            A list of ``Document`` objects (always a list, never ``None``).
        """
        units = parse_knowledge_units(str(file))

        if split_documents:
            return [
                Document(text=unit, extra_info=extra_info or {})
                for unit in units
            ]

        # Previously this path fell through and returned None, which breaks
        # callers that iterate over the result.
        return [Document(text="\n\n".join(units), extra_info=extra_info or {})]
|
| 76 |
+
|
| 77 |
+
def parse_knowledge_units(file_path):
    """Split a UTF-8 text file into "knowledge units".

    A new unit starts at every line whose first character is a digit and
    which contains a ``.`` somewhere (e.g. ``"1. Title"``). Lines before the
    first such line form their own unit. Each unit is returned with
    surrounding whitespace stripped.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of non-empty unit strings in file order. Units that are empty
        after stripping (leading blank lines, a blank file) are omitted.
    """
    knowledge_units = []
    pending_lines = []

    def _flush():
        # Strip before testing so whitespace-only buffers do not yield "".
        unit = "".join(pending_lines).strip()
        if unit:
            knowledge_units.append(unit)
        pending_lines.clear()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # A digit in column 0 plus a '.' anywhere marks a unit header.
            if line[:1].isdigit() and '.' in line:
                _flush()
            pending_lines.append(line)

    _flush()
    return knowledge_units
|
| 97 |
+
|
| 98 |
def default_file_metadata_func(file_path: str) -> Dict:
|
| 99 |
"""Get some handy metadate from filesystem.
|
| 100 |
|
scripts/convert_docx_to_md.sh
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
#
# Convert every .docx file directly inside a directory to Markdown using
# pandoc. Each output .md file is written next to its source file, and the
# same directory is passed to pandoc's --extract-media option.
#
# Usage: convert_docx_to_md.sh <directory_path>
# Exit status: 0 when every conversion succeeded, 1 on bad usage, a missing
# directory, or at least one failed conversion.

# Check if a directory path is provided
if [ "$#" -ne 1 ]; then
    echo "Usage: $0 <directory_path>" >&2
    exit 1
fi

# Get the directory path from the argument
dir_path=$1

# Check if the specified directory exists
if [ ! -d "$dir_path" ]; then
    echo "Directory does not exist: $dir_path" >&2
    exit 1
fi

failures=0

# Iterate through all .docx files in the specified directory
for docx_file in "$dir_path"/*.docx; do
    # When the glob matches nothing it stays literal; skip that non-file.
    if [ ! -f "$docx_file" ]; then
        continue
    fi

    # Extract filename without extension
    filename=$(basename -- "$docx_file")
    filename="${filename%.*}"

    # Define the output Markdown filename
    md_file="${dir_path}/${filename}.md"

    # Convert the document; only report success when pandoc succeeded.
    if pandoc -t markdown --extract-media="$dir_path" "$docx_file" -o "$md_file"; then
        echo "Converted: $docx_file to $md_file"
    else
        echo "Failed to convert: $docx_file" >&2
        failures=$((failures + 1))
    fi
done

if [ "$failures" -gt 0 ]; then
    echo "Conversion finished with $failures failure(s)." >&2
    exit 1
fi

echo "Conversion complete."
|