Spaces:
No application file
No application file
Upload 3 files
Browse files- config.yml +11 -0
- ingest.py +31 -0
- l.ipynb +109 -0
config.yml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
RETURN_SOURCE_DOCUMENTS: True
|
| 2 |
+
VECTOR_COUNT: 2
|
| 3 |
+
CHUNK_SIZE: 300
|
| 4 |
+
CHUNK_OVERLAP: 30
|
| 5 |
+
DATA_PATH: 'data/'
|
| 6 |
+
DB_FAISS_PATH: 'vectorstore/db_faiss'
|
| 7 |
+
MODEL_TYPE: 'mistral'
|
| 8 |
+
MODEL_BIN_PATH: 'models/mistral-7b-instruct-v0.2.Q4_0.gguf'
|
| 9 |
+
EMBEDDINGS: 'sentence-transformers/all-mpnet-base-v2'
|
| 10 |
+
MAX_NEW_TOKENS: 256
|
| 11 |
+
TEMPERATURE: 0.0
|
ingest.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import box
|
| 2 |
+
import yaml
|
| 3 |
+
from langchain.vectorstores import FAISS
|
| 4 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 5 |
+
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
|
| 6 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Read config.yml once at import time; wrapping the parsed mapping in a Box
# lets the rest of the module use attribute access (e.g. cfg.DATA_PATH).
with open('config.yml', 'r', encoding='utf8') as ymlfile:
    _raw_config = yaml.safe_load(ymlfile)
cfg = box.Box(_raw_config)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def run_ingest():
    """Build a FAISS vector store from the PDFs found under ``cfg.DATA_PATH``.

    Steps:
      1. Load every file matching ``*.pdf`` in ``cfg.DATA_PATH`` with
         ``PyPDFLoader``.
      2. Split the loaded documents into chunks using ``cfg.CHUNK_SIZE`` and
         ``cfg.CHUNK_OVERLAP``.
      3. Embed the chunks on the CPU with the ``cfg.EMBEDDINGS`` model.
      4. Save the resulting FAISS index to ``cfg.DB_FAISS_PATH``.

    Raises:
        ValueError: If loading and splitting produced no chunks (for example
            the data directory holds no PDFs), since there is nothing to index.
    """
    loader = DirectoryLoader(
        cfg.DATA_PATH,
        glob='*.pdf',
        loader_cls=PyPDFLoader,
    )
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=cfg.CHUNK_SIZE,
        chunk_overlap=cfg.CHUNK_OVERLAP,
    )
    texts = text_splitter.split_documents(documents)

    # Fail early with an actionable message: an empty chunk list would
    # otherwise only surface as an unhelpful error from inside
    # FAISS.from_documents, after the embedding model has been loaded.
    if not texts:
        raise ValueError(
            f"No text chunks were produced from PDFs in {cfg.DATA_PATH!r}; "
            "nothing to index."
        )

    embeddings = HuggingFaceEmbeddings(
        model_name=cfg.EMBEDDINGS,
        model_kwargs={'device': 'cpu'},
    )

    vectorstore = FAISS.from_documents(texts, embeddings)
    vectorstore.save_local(cfg.DB_FAISS_PATH)
|
| 29 |
+
|
| 30 |
+
# Only build the vector store when this file is executed as a script,
# so importing the module does not trigger ingestion.
if __name__ == '__main__':
    run_ingest()
|
l.ipynb
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [
|
| 8 |
+
{
|
| 9 |
+
"name": "stdout",
|
| 10 |
+
"output_type": "stream",
|
| 11 |
+
"text": [
|
| 12 |
+
"[WinError 2] The system cannot find the file specified: 'llm-mistral-invoice-cpu'\n",
|
| 13 |
+
"c:\\Users\\Asus\\Desktop\\LLM\\llm-mistral-invoice-cpu - Copy\n"
|
| 14 |
+
]
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"source": [
|
| 18 |
+
"%cd llm-mistral-invoice-cpu"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"cell_type": "code",
|
| 23 |
+
"execution_count": 2,
|
| 24 |
+
"metadata": {},
|
| 25 |
+
"outputs": [
|
| 26 |
+
{
|
| 27 |
+
"name": "stderr",
|
| 28 |
+
"output_type": "stream",
|
| 29 |
+
"text": [
|
| 30 |
+
"DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\n"
|
| 31 |
+
]
|
| 32 |
+
}
|
| 33 |
+
],
|
| 34 |
+
"source": [
|
| 35 |
+
"!pip install -r requirements.txt -q"
|
| 36 |
+
]
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"cell_type": "code",
|
| 40 |
+
"execution_count": 3,
|
| 41 |
+
"metadata": {},
|
| 42 |
+
"outputs": [
|
| 43 |
+
{
|
| 44 |
+
"data": {
|
| 45 |
+
"text/plain": [
|
| 46 |
+
"['Students Encyclopedia of General Knowledge- Ahmad. K.A..pdf']"
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
"execution_count": 3,
|
| 50 |
+
"metadata": {},
|
| 51 |
+
"output_type": "execute_result"
|
| 52 |
+
}
|
| 53 |
+
],
|
| 54 |
+
"source": [
|
| 55 |
+
"import os\n",
|
| 56 |
+
"os.listdir('data')"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"cell_type": "code",
|
| 61 |
+
"execution_count": 4,
|
| 62 |
+
"metadata": {},
|
| 63 |
+
"outputs": [],
|
| 64 |
+
"source": [
|
| 65 |
+
"!python ingest.py"
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"cell_type": "code",
|
| 70 |
+
"execution_count": 5,
|
| 71 |
+
"metadata": {},
|
| 72 |
+
"outputs": [
|
| 73 |
+
{
|
| 74 |
+
"name": "stdout",
|
| 75 |
+
"output_type": "stream",
|
| 76 |
+
"text": [
|
| 77 |
+
"\n",
|
| 78 |
+
"Answer: Magnetism is a physical phenomenon arising from the motion of electric charges. It causes materials to be attracted or repelled to each other. It is closely related to electricity, and together they are part of the electromagnetic force, which is one of the four fundamental forces in nature.\n",
|
| 79 |
+
"==================================================\n"
|
| 80 |
+
]
|
| 81 |
+
}
|
| 82 |
+
],
|
| 83 |
+
"source": [
|
| 84 |
+
"!python main.py \"what is magnetism?\""
|
| 85 |
+
]
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"metadata": {
|
| 89 |
+
"kernelspec": {
|
| 90 |
+
"display_name": "Python 3",
|
| 91 |
+
"language": "python",
|
| 92 |
+
"name": "python3"
|
| 93 |
+
},
|
| 94 |
+
"language_info": {
|
| 95 |
+
"codemirror_mode": {
|
| 96 |
+
"name": "ipython",
|
| 97 |
+
"version": 3
|
| 98 |
+
},
|
| 99 |
+
"file_extension": ".py",
|
| 100 |
+
"mimetype": "text/x-python",
|
| 101 |
+
"name": "python",
|
| 102 |
+
"nbconvert_exporter": "python",
|
| 103 |
+
"pygments_lexer": "ipython3",
|
| 104 |
+
"version": "3.10.11"
|
| 105 |
+
}
|
| 106 |
+
},
|
| 107 |
+
"nbformat": 4,
|
| 108 |
+
"nbformat_minor": 2
|
| 109 |
+
}
|