Spaces:
Running
Running
Enhance knowledge base rebuilding functionality: Add method to rebuild knowledge base from selected URLs, improve error handling, and log actions during vector store download process.
Browse files
- app.py +49 -4
- src/knowledge_base/dataset.py +10 -11
app.py
CHANGED
|
@@ -72,6 +72,51 @@ def update_kb_with_selected(sources_df) -> str:
|
|
| 72 |
logger.error(f"Error updating knowledge base: {str(e)}")
|
| 73 |
return f"Error updating knowledge base: {str(e)}"
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# Set seed for consistent results
|
| 76 |
langdetect.DetectorFactory.seed = 0
|
| 77 |
|
|
@@ -1734,17 +1779,15 @@ def rebuild_kb_with_selected(sources_df):
|
|
| 1734 |
return "Error: No URLs selected for inclusion"
|
| 1735 |
|
| 1736 |
# Временно заменяем URLS на выбранные URL
|
| 1737 |
-
|
| 1738 |
-
original_urls = constants.URLS
|
| 1739 |
constants.URLS = selected_urls
|
| 1740 |
|
| 1741 |
try:
|
| 1742 |
# Пересоздаем базу знаний
|
| 1743 |
success, message = create_vector_store(mode="rebuild")
|
| 1744 |
|
| 1745 |
-
# Сохраняем метаданные
|
| 1746 |
if success:
|
| 1747 |
-
# Создаем метаданные с текущей датой и выбранными URL
|
| 1748 |
metadata = {
|
| 1749 |
"last_updated": datetime.datetime.now().isoformat(),
|
| 1750 |
"source_count": len(selected_urls),
|
|
@@ -1763,11 +1806,13 @@ def rebuild_kb_with_selected(sources_df):
|
|
| 1763 |
)
|
| 1764 |
|
| 1765 |
return message
|
|
|
|
| 1766 |
finally:
|
| 1767 |
# Восстанавливаем оригинальные URL
|
| 1768 |
constants.URLS = original_urls
|
| 1769 |
|
| 1770 |
except Exception as e:
|
|
|
|
| 1771 |
return f"Error rebuilding knowledge base: {str(e)}"
|
| 1772 |
|
| 1773 |
def save_kb_metadata():
|
|
|
|
| 72 |
logger.error(f"Error updating knowledge base: {str(e)}")
|
| 73 |
return f"Error updating knowledge base: {str(e)}"
|
| 74 |
|
| 75 |
+
def rebuild_kb_with_selected(sources_df):
    """Rebuild the knowledge base from scratch using only the selected URLs.

    The URLs returned by ``get_selected_urls(sources_df)`` temporarily replace
    ``constants.URLS`` while ``create_vector_store(mode="rebuild")`` runs.
    When the rebuild reports success, a ``vector_store/metadata.json`` file
    (timestamp, source count, source list) is uploaded to the ``DATASET_ID``
    dataset repository. ``constants.URLS`` is restored in all cases.

    Args:
        sources_df: Table of sources, passed unchanged to
            ``get_selected_urls``.

    Returns:
        str: The message produced by ``create_vector_store``. If the rebuild
        succeeded but the metadata upload failed, the same message with a
        warning appended. Otherwise a string starting with ``"Error"`` when
        no URLs were selected or the rebuild itself raised.
    """
    try:
        selected_urls = get_selected_urls(sources_df)

        if not selected_urls:
            return "Error: No URLs selected for inclusion"

        # Temporarily replace the global URL list with the selected URLs.
        # NOTE(review): presumably create_vector_store reads constants.URLS;
        # confirm against its implementation.
        original_urls = constants.URLS.copy()
        constants.URLS = selected_urls

        try:
            # Recreate the knowledge base.
            success, message = create_vector_store(mode="rebuild")

            # Save metadata only when the rebuild succeeded.
            if success:
                metadata = {
                    "last_updated": datetime.datetime.now().isoformat(),
                    "source_count": len(selected_urls),
                    "sources": selected_urls
                }

                # The vector store is already rebuilt at this point, so a
                # failed metadata upload must not be reported as a failed
                # rebuild; surface it as a warning instead.
                try:
                    json_content = json.dumps(metadata, indent=2).encode('utf-8')
                    api = HfApi(token=HF_TOKEN)

                    api.upload_file(
                        path_or_fileobj=json_content,
                        path_in_repo="vector_store/metadata.json",
                        repo_id=DATASET_ID,
                        repo_type="dataset"
                    )
                except Exception as e:
                    logger.exception(f"Error saving knowledge base metadata: {str(e)}")
                    return f"{message} (warning: metadata was not saved: {str(e)})"

            return message

        finally:
            # Restore the original URL list.
            constants.URLS = original_urls

    except Exception as e:
        # logger.exception logs at ERROR level and keeps the traceback.
        logger.exception(f"Error rebuilding knowledge base: {str(e)}")
        return f"Error rebuilding knowledge base: {str(e)}"
|
| 119 |
+
|
| 120 |
# Set seed for consistent results
|
| 121 |
langdetect.DetectorFactory.seed = 0
|
| 122 |
|
|
|
|
| 1779 |
return "Error: No URLs selected for inclusion"
|
| 1780 |
|
| 1781 |
# Временно заменяем URLS на выбранные URL
|
| 1782 |
+
original_urls = constants.URLS.copy()
|
|
|
|
| 1783 |
constants.URLS = selected_urls
|
| 1784 |
|
| 1785 |
try:
|
| 1786 |
# Пересоздаем базу знаний
|
| 1787 |
success, message = create_vector_store(mode="rebuild")
|
| 1788 |
|
| 1789 |
+
# Сохраняем метаданные если успешно
|
| 1790 |
if success:
|
|
|
|
| 1791 |
metadata = {
|
| 1792 |
"last_updated": datetime.datetime.now().isoformat(),
|
| 1793 |
"source_count": len(selected_urls),
|
|
|
|
| 1806 |
)
|
| 1807 |
|
| 1808 |
return message
|
| 1809 |
+
|
| 1810 |
finally:
|
| 1811 |
# Восстанавливаем оригинальные URL
|
| 1812 |
constants.URLS = original_urls
|
| 1813 |
|
| 1814 |
except Exception as e:
|
| 1815 |
+
logger.error(f"Error rebuilding knowledge base: {str(e)}")
|
| 1816 |
return f"Error rebuilding knowledge base: {str(e)}"
|
| 1817 |
|
| 1818 |
def save_kb_metadata():
|
src/knowledge_base/dataset.py
CHANGED
|
@@ -45,17 +45,17 @@ def download_vector_store(self) -> Tuple[bool, Union[FAISS, str]]:
|
|
| 45 |
"""Download vector store from dataset"""
|
| 46 |
try:
|
| 47 |
with tempfile.TemporaryDirectory() as temp_dir:
|
| 48 |
-
|
| 49 |
|
| 50 |
-
# Download files to temporary directory
|
| 51 |
try:
|
|
|
|
| 52 |
index_path = self.api.hf_hub_download(
|
| 53 |
repo_id=self.dataset_name,
|
| 54 |
filename="vector_store/index.faiss",
|
| 55 |
repo_type="dataset",
|
| 56 |
local_dir=temp_dir
|
| 57 |
)
|
| 58 |
-
|
| 59 |
|
| 60 |
config_path = self.api.hf_hub_download(
|
| 61 |
repo_id=self.dataset_name,
|
|
@@ -63,27 +63,26 @@ def download_vector_store(self) -> Tuple[bool, Union[FAISS, str]]:
|
|
| 63 |
repo_type="dataset",
|
| 64 |
local_dir=temp_dir
|
| 65 |
)
|
| 66 |
-
|
| 67 |
|
| 68 |
-
#
|
| 69 |
-
if not os.path.exists(index_path) or not os.path.exists(config_path):
|
| 70 |
-
return False, f"Downloaded files not found at {temp_dir}"
|
| 71 |
-
|
| 72 |
-
# Load vector store from temporary directory
|
| 73 |
embeddings = HuggingFaceEmbeddings(
|
| 74 |
model_name=EMBEDDING_MODEL,
|
| 75 |
model_kwargs={'device': 'cpu'}
|
| 76 |
)
|
| 77 |
|
|
|
|
| 78 |
vector_store = FAISS.load_local(
|
| 79 |
-
os.path.join(temp_dir, "vector_store"),
|
| 80 |
-
embeddings
|
| 81 |
)
|
| 82 |
|
| 83 |
return True, vector_store
|
| 84 |
|
| 85 |
except Exception as e:
|
|
|
|
| 86 |
return False, f"Error downloading vector store: {str(e)}"
|
|
|
|
| 87 |
except Exception as e:
|
| 88 |
logger.error(f"Error in download_vector_store: {str(e)}")
|
| 89 |
return False, str(e)
|
|
|
|
| 45 |
"""Download vector store from dataset"""
|
| 46 |
try:
|
| 47 |
with tempfile.TemporaryDirectory() as temp_dir:
|
| 48 |
+
logger.debug(f"Downloading to temporary directory: {temp_dir}")
|
| 49 |
|
|
|
|
| 50 |
try:
|
| 51 |
+
# Download vector store files
|
| 52 |
index_path = self.api.hf_hub_download(
|
| 53 |
repo_id=self.dataset_name,
|
| 54 |
filename="vector_store/index.faiss",
|
| 55 |
repo_type="dataset",
|
| 56 |
local_dir=temp_dir
|
| 57 |
)
|
| 58 |
+
logger.debug(f"Downloaded index.faiss to: {index_path}")
|
| 59 |
|
| 60 |
config_path = self.api.hf_hub_download(
|
| 61 |
repo_id=self.dataset_name,
|
|
|
|
| 63 |
repo_type="dataset",
|
| 64 |
local_dir=temp_dir
|
| 65 |
)
|
| 66 |
+
logger.debug(f"Downloaded index.pkl to: {config_path}")
|
| 67 |
|
| 68 |
+
# Initialize embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
embeddings = HuggingFaceEmbeddings(
|
| 70 |
model_name=EMBEDDING_MODEL,
|
| 71 |
model_kwargs={'device': 'cpu'}
|
| 72 |
)
|
| 73 |
|
| 74 |
+
# Load vector store
|
| 75 |
vector_store = FAISS.load_local(
|
| 76 |
+
folder_path=os.path.join(temp_dir, "vector_store"),
|
| 77 |
+
embeddings=embeddings
|
| 78 |
)
|
| 79 |
|
| 80 |
return True, vector_store
|
| 81 |
|
| 82 |
except Exception as e:
|
| 83 |
+
logger.error(f"Error downloading vector store: {str(e)}")
|
| 84 |
return False, f"Error downloading vector store: {str(e)}"
|
| 85 |
+
|
| 86 |
except Exception as e:
|
| 87 |
logger.error(f"Error in download_vector_store: {str(e)}")
|
| 88 |
return False, str(e)
|