Spaces:
Running
Running
Refactor app.py and dataset.py: Update comments for clarity, enhance download_vector_store method with improved error handling, and streamline vector store download process.
Browse files- app.py +24 -15
- src/knowledge_base/dataset.py +29 -88
app.py
CHANGED
|
@@ -995,7 +995,7 @@ def initialize_app():
|
|
| 995 |
token=HF_TOKEN
|
| 996 |
)
|
| 997 |
|
| 998 |
-
#
|
| 999 |
system_prompt_text = DEFAULT_SYSTEM_MESSAGE
|
| 1000 |
if "system_prompt" in preferences and "current" in preferences["system_prompt"]:
|
| 1001 |
system_prompt_text = preferences["system_prompt"]["current"]
|
|
@@ -1012,7 +1012,7 @@ def initialize_chat_evaluator():
|
|
| 1012 |
dataset_id=DATASET_ID
|
| 1013 |
)
|
| 1014 |
|
| 1015 |
-
#
|
| 1016 |
os.makedirs(DATASET_CHAT_HISTORY_PATH, exist_ok=True)
|
| 1017 |
os.makedirs(os.path.join(DATASET_ANNOTATIONS_PATH), exist_ok=True)
|
| 1018 |
|
|
@@ -1180,7 +1180,7 @@ with gr.Blocks(css="""
|
|
| 1180 |
Please create a knowledge base using the buttons on the left.
|
| 1181 |
"""
|
| 1182 |
|
| 1183 |
-
#
|
| 1184 |
doc_count = len(vector_store.docstore._dict)
|
| 1185 |
sources = set()
|
| 1186 |
|
|
@@ -1651,33 +1651,42 @@ def get_selected_urls(sources_df):
|
|
| 1651 |
logger.error(f"Error getting selected URLs: {str(e)}")
|
| 1652 |
return []
|
| 1653 |
|
| 1654 |
-
def update_kb_with_selected(sources_df):
|
| 1655 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1656 |
try:
|
| 1657 |
-
|
|
|
|
| 1658 |
|
| 1659 |
if not selected_urls:
|
| 1660 |
-
return "Error: No
|
|
|
|
|
|
|
|
|
|
| 1661 |
|
| 1662 |
-
#
|
| 1663 |
-
from config import constants
|
| 1664 |
-
original_urls = constants.URLS
|
| 1665 |
constants.URLS = selected_urls
|
| 1666 |
|
| 1667 |
try:
|
| 1668 |
-
#
|
| 1669 |
success, message = create_vector_store(mode="update")
|
| 1670 |
|
| 1671 |
-
# Сохраняем метаданные с информацией о выбранных URL
|
| 1672 |
if success:
|
| 1673 |
-
#
|
| 1674 |
metadata = {
|
| 1675 |
"last_updated": datetime.datetime.now().isoformat(),
|
| 1676 |
"source_count": len(selected_urls),
|
| 1677 |
"sources": selected_urls
|
| 1678 |
}
|
| 1679 |
|
| 1680 |
-
#
|
| 1681 |
json_content = json.dumps(metadata, indent=2).encode('utf-8')
|
| 1682 |
api = HfApi(token=HF_TOKEN)
|
| 1683 |
|
|
@@ -1690,7 +1699,7 @@ def update_kb_with_selected(sources_df):
|
|
| 1690 |
|
| 1691 |
return message
|
| 1692 |
finally:
|
| 1693 |
-
#
|
| 1694 |
constants.URLS = original_urls
|
| 1695 |
|
| 1696 |
except Exception as e:
|
|
|
|
| 995 |
token=HF_TOKEN
|
| 996 |
)
|
| 997 |
|
| 998 |
+
# Load saved system prompt from preferences or use DEFAULT_SYSTEM_MESSAGE
|
| 999 |
system_prompt_text = DEFAULT_SYSTEM_MESSAGE
|
| 1000 |
if "system_prompt" in preferences and "current" in preferences["system_prompt"]:
|
| 1001 |
system_prompt_text = preferences["system_prompt"]["current"]
|
|
|
|
| 1012 |
dataset_id=DATASET_ID
|
| 1013 |
)
|
| 1014 |
|
| 1015 |
+
# Check if directories exist
|
| 1016 |
os.makedirs(DATASET_CHAT_HISTORY_PATH, exist_ok=True)
|
| 1017 |
os.makedirs(os.path.join(DATASET_ANNOTATIONS_PATH), exist_ok=True)
|
| 1018 |
|
|
|
|
| 1180 |
Please create a knowledge base using the buttons on the left.
|
| 1181 |
"""
|
| 1182 |
|
| 1183 |
+
# Get information about vector store
|
| 1184 |
doc_count = len(vector_store.docstore._dict)
|
| 1185 |
sources = set()
|
| 1186 |
|
|
|
|
| 1651 |
logger.error(f"Error getting selected URLs: {str(e)}")
|
| 1652 |
return []
|
| 1653 |
|
| 1654 |
+
def update_kb_with_selected(sources_df) -> str:
|
| 1655 |
+
"""
|
| 1656 |
+
Updates knowledge base with selected sources.
|
| 1657 |
+
|
| 1658 |
+
Args:
|
| 1659 |
+
sources_df: Dataframe containing sources and their selection status
|
| 1660 |
+
|
| 1661 |
+
Returns:
|
| 1662 |
+
str: Status message
|
| 1663 |
+
"""
|
| 1664 |
try:
|
| 1665 |
+
# Filter selected URLs
|
| 1666 |
+
selected_urls = sources_df[sources_df['Include']]['URL'].tolist()
|
| 1667 |
|
| 1668 |
if not selected_urls:
|
| 1669 |
+
return "Error: No sources selected"
|
| 1670 |
+
|
| 1671 |
+
# Store original URLs
|
| 1672 |
+
original_urls = URLS.copy()
|
| 1673 |
|
| 1674 |
+
# Update URLS with selected ones
|
|
|
|
|
|
|
| 1675 |
constants.URLS = selected_urls
|
| 1676 |
|
| 1677 |
try:
|
| 1678 |
+
# Update knowledge base
|
| 1679 |
success, message = create_vector_store(mode="update")
|
| 1680 |
|
|
|
|
| 1681 |
if success:
|
| 1682 |
+
# Create metadata with current date and selected URLs
|
| 1683 |
metadata = {
|
| 1684 |
"last_updated": datetime.datetime.now().isoformat(),
|
| 1685 |
"source_count": len(selected_urls),
|
| 1686 |
"sources": selected_urls
|
| 1687 |
}
|
| 1688 |
|
| 1689 |
+
# Save to dataset
|
| 1690 |
json_content = json.dumps(metadata, indent=2).encode('utf-8')
|
| 1691 |
api = HfApi(token=HF_TOKEN)
|
| 1692 |
|
|
|
|
| 1699 |
|
| 1700 |
return message
|
| 1701 |
finally:
|
| 1702 |
+
# Restore original URLs
|
| 1703 |
constants.URLS = original_urls
|
| 1704 |
|
| 1705 |
except Exception as e:
|
src/knowledge_base/dataset.py
CHANGED
|
@@ -40,106 +40,47 @@ class DatasetManager:
|
|
| 40 |
|
| 41 |
# Добавьте этот метод в класс DatasetManager в файле src/knowledge_base/dataset.py
|
| 42 |
|
| 43 |
-
def download_vector_store(self):
|
| 44 |
"""
|
| 45 |
-
|
| 46 |
|
| 47 |
Returns:
|
| 48 |
-
tuple: (success, result)
|
| 49 |
"""
|
| 50 |
try:
|
| 51 |
-
|
| 52 |
-
import shutil
|
| 53 |
-
from langchain.vectorstores import FAISS
|
| 54 |
-
from langchain.embeddings import HuggingFaceEmbeddings
|
| 55 |
-
from config.settings import EMBEDDING_MODEL, DATASET_VECTOR_STORE_PATH
|
| 56 |
-
|
| 57 |
-
logger.info(f"Attempting to download vector store from dataset {self.dataset_id}")
|
| 58 |
-
|
| 59 |
-
# Создаем временную директорию для скачивания
|
| 60 |
temp_dir = tempfile.mkdtemp()
|
| 61 |
logger.debug(f"Created temporary directory at {temp_dir}")
|
| 62 |
|
| 63 |
try:
|
| 64 |
-
#
|
| 65 |
-
api
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
-
# Проверяем наличие файлов индекса в датасете
|
| 68 |
-
try:
|
| 69 |
-
files = api.list_repo_files(
|
| 70 |
-
repo_id=self.dataset_id,
|
| 71 |
-
repo_type="dataset"
|
| 72 |
-
)
|
| 73 |
-
|
| 74 |
-
# Ищем файлы векторного хранилища
|
| 75 |
-
vector_store_files = [f for f in files if f.startswith(f"{DATASET_VECTOR_STORE_PATH}/")]
|
| 76 |
-
|
| 77 |
-
if not vector_store_files:
|
| 78 |
-
logger.warning(f"No vector store files found in dataset {self.dataset_id}")
|
| 79 |
-
return False, "Vector store not found in dataset"
|
| 80 |
-
|
| 81 |
-
# Создаем папку для скачивания
|
| 82 |
-
vector_store_dir = os.path.join(temp_dir, DATASET_VECTOR_STORE_PATH)
|
| 83 |
-
os.makedirs(vector_store_dir, exist_ok=True)
|
| 84 |
-
|
| 85 |
-
# Скачиваем все файлы
|
| 86 |
-
for file in vector_store_files:
|
| 87 |
-
# Получаем имя файла без пути
|
| 88 |
-
filename = os.path.basename(file)
|
| 89 |
-
# Скачиваем файл
|
| 90 |
-
api.hf_hub_download(
|
| 91 |
-
repo_id=self.dataset_id,
|
| 92 |
-
repo_type="dataset",
|
| 93 |
-
filename=file,
|
| 94 |
-
local_dir=temp_dir,
|
| 95 |
-
local_dir_use_symlinks=False
|
| 96 |
-
)
|
| 97 |
-
logger.debug(f"Downloaded {file}")
|
| 98 |
-
|
| 99 |
-
# Инициализируем embeddings
|
| 100 |
-
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
|
| 101 |
-
|
| 102 |
-
# Загружаем FAISS из скачанных файлов
|
| 103 |
-
try:
|
| 104 |
-
# Путь к директории с файлами FAISS
|
| 105 |
-
faiss_path = os.path.join(temp_dir, DATASET_VECTOR_STORE_PATH)
|
| 106 |
-
|
| 107 |
-
# Проверяем наличие необходимых файлов
|
| 108 |
-
if not os.path.exists(os.path.join(faiss_path, "index.faiss")):
|
| 109 |
-
logger.error(f"Missing FAISS index file at {faiss_path}")
|
| 110 |
-
return False, "Missing FAISS index file"
|
| 111 |
-
|
| 112 |
-
if not os.path.exists(os.path.join(faiss_path, "index.pkl")):
|
| 113 |
-
logger.error(f"Missing FAISS pickle file at {faiss_path}")
|
| 114 |
-
return False, "Missing FAISS pickle file"
|
| 115 |
-
|
| 116 |
-
# Загружаем FAISS из директории
|
| 117 |
-
faiss_index = FAISS.load_local(faiss_path, embeddings)
|
| 118 |
-
logger.info(f"Successfully loaded FAISS index with {len(faiss_index.docstore._dict)} documents")
|
| 119 |
-
|
| 120 |
-
return True, faiss_index
|
| 121 |
-
|
| 122 |
-
except Exception as e:
|
| 123 |
-
logger.error(f"Error loading FAISS index: {str(e)}")
|
| 124 |
-
return False, f"Error loading FAISS index: {str(e)}"
|
| 125 |
-
|
| 126 |
-
except Exception as e:
|
| 127 |
-
logger.error(f"Error listing files in dataset {self.dataset_id}: {str(e)}")
|
| 128 |
-
return False, f"Error accessing dataset: {str(e)}"
|
| 129 |
-
|
| 130 |
finally:
|
| 131 |
-
#
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
logger.debug(f"Cleaned up temporary directory {temp_dir}")
|
| 135 |
-
except Exception as e:
|
| 136 |
-
logger.warning(f"Error cleaning up temporary directory {temp_dir}: {str(e)}")
|
| 137 |
-
|
| 138 |
except Exception as e:
|
| 139 |
-
logger.error(f"
|
| 140 |
-
|
| 141 |
-
logger.error(traceback.format_exc())
|
| 142 |
-
return False, f"Error downloading vector store: {str(e)}"
|
| 143 |
|
| 144 |
def get_last_update_date(self):
|
| 145 |
"""
|
|
|
|
| 40 |
|
| 41 |
# Добавьте этот метод в класс DatasetManager в файле src/knowledge_base/dataset.py
|
| 42 |
|
| 43 |
+
def download_vector_store(self) -> Tuple[bool, Union[FAISS, str]]:
|
| 44 |
"""
|
| 45 |
+
Downloads vector store from dataset.
|
| 46 |
|
| 47 |
Returns:
|
| 48 |
+
tuple: (success, result) where result is either FAISS object or error message
|
| 49 |
"""
|
| 50 |
try:
|
| 51 |
+
# Create temporary directory for download
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
temp_dir = tempfile.mkdtemp()
|
| 53 |
logger.debug(f"Created temporary directory at {temp_dir}")
|
| 54 |
|
| 55 |
try:
|
| 56 |
+
# Download vector store files
|
| 57 |
+
self.api.snapshot_download(
|
| 58 |
+
repo_id=self.dataset_name,
|
| 59 |
+
repo_type="dataset",
|
| 60 |
+
local_dir=temp_dir,
|
| 61 |
+
allow_patterns=["vector_store/*"]
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
# Load vector store
|
| 65 |
+
embeddings = HuggingFaceEmbeddings(
|
| 66 |
+
model_name=EMBEDDING_MODEL,
|
| 67 |
+
model_kwargs={'device': 'cpu'}
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
vector_store = FAISS.load_local(
|
| 71 |
+
os.path.join(temp_dir, "vector_store"),
|
| 72 |
+
embeddings
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
return True, vector_store
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
finally:
|
| 78 |
+
# Clean up temp directory
|
| 79 |
+
shutil.rmtree(temp_dir)
|
| 80 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
except Exception as e:
|
| 82 |
+
logger.error(f"Error downloading vector store: {str(e)}")
|
| 83 |
+
return False, f"Error downloading vector store: {str(e)}"
|
|
|
|
|
|
|
| 84 |
|
| 85 |
def get_last_update_date(self):
|
| 86 |
"""
|