from llama_index.core import SimpleDirectoryReader, get_response_synthesizer
from llama_index.core import DocumentSummaryIndex
from llama_index.llms.ollama import Ollama
from llama_index.core.node_parser import SentenceSplitter
from langchain_community.embeddings import OllamaEmbeddings
from llama_index.core import Settings
import nest_asyncio

# Allow nested event loops: use_async=True below runs asyncio internally,
# which fails inside environments (e.g. Jupyter) that already have a running loop.
nest_asyncio.apply()

# Load data.
# BUG FIX: SimpleDirectoryReader's `file_metadata` parameter expects a
# callable (filename -> metadata dict), not a plain dict — the reader calls
# file_metadata(filename) for each file, so passing a dict raises TypeError.
react_doc = SimpleDirectoryReader(
    input_dir="../data/pdf",
    file_metadata=lambda _filename: {"category": "AI applications"},
).load_data()

# LLM served by a local Ollama instance.
llm = Ollama(model="llama3", request_timeout=120, base_url="http://localhost:11434")
# NOTE(review): this is LangChain's Ollama embedding class; llama_index's
# Settings wraps LangChain embeddings automatically, but the native
# llama_index.embeddings.ollama.OllamaEmbedding would avoid the extra
# dependency — confirm which is intended.
embed_model = OllamaEmbeddings(model="llama3")
Settings.llm = llm
Settings.embed_model = embed_model

# Text splitter: chunk documents into pieces of up to 500 tokens, no overlap.
sentence_splitter = SentenceSplitter(chunk_size=500, chunk_overlap=0)

# response_mode: how retrieved nodes are synthesized into a response.
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize",
    use_async=True,
)

# Build index: `transformations` preprocesses the documents,
# `response_synthesizer` controls how per-document summaries are generated.
index = DocumentSummaryIndex.from_documents(
    react_doc,
    llm=llm,
    transformations=[sentence_splitter],
    response_synthesizer=response_synthesizer,
    show_progress=True,
)

# Persist the index to disk.
index.storage_context.persist("../kb/index")