import pandas as pd
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
def load_and_split_markdown(filepath='https://drive.google.com/u/0/uc?id=1JQswhvNz6yNKKzJW0nrXU7AmUQaGevxA&export=download'):
    """Load city descriptions from a CSV source and split them into chunks.

    Parameters
    ----------
    filepath : str
        Path or URL of a CSV file readable by ``pandas.read_csv``. It must
        contain the columns ``City``, ``Name``, ``description``, ``Lon``,
        ``Lat`` and ``image``.

    Returns
    -------
    list[LangchainDocument]
        Chunked documents (at most 1000 characters each, with a 100-character
        overlap); every chunk carries longitude/latitude/image metadata
        copied from its source row.
    """
    # Load the data
    data_cities = pd.read_csv(filepath)

    # One document per CSV row: the human-readable columns form the page
    # content, coordinates and image URL go into metadata.
    raw_knowledge_base = [
        LangchainDocument(
            page_content=f"{row['City']} | {row['Name']} | {row['description']}",
            metadata={
                "longitude": row['Lon'],
                "latitude": row['Lat'],
                "image": row['image'],
            },
        )
        for _, row in data_cities.iterrows()
    ]

    # Markdown-aware separators, tried from coarsest (headings) to finest
    # (single characters).
    # NOTE(review): entries such as "\n#{1,6} " are written as regexes, but
    # RecursiveCharacterTextSplitter matches separators literally unless
    # is_separator_regex=True is passed — confirm this is intentional.
    markdown_separators = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]

    # Configure the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        add_start_index=True,      # record each chunk's offset in its metadata
        strip_whitespace=True,
        separators=markdown_separators,
    )

    # Split every row-document into chunks and flatten the result.
    docs_processed = []
    for doc in raw_knowledge_base:
        docs_processed += text_splitter.split_documents([doc])
    return docs_processed