Abc123Harsh committed on
Commit
e4cea98
·
verified ·
1 Parent(s): 972ba8f

Upload 3 files

Browse files
Files changed (3) hide show
  1. config.yml +11 -0
  2. ingest.py +31 -0
  3. l.ipynb +109 -0
config.yml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Runtime settings; ingest.py loads this file with yaml.safe_load and reads
# the keys as attributes of `cfg`.
RETURN_SOURCE_DOCUMENTS: True
VECTOR_COUNT: 2
# Passed to the text splitter in ingest.py as chunk_size / chunk_overlap.
CHUNK_SIZE: 300
CHUNK_OVERLAP: 30
# Directory that ingest.py scans for *.pdf files.
DATA_PATH: 'data/'
# Location where ingest.py saves the FAISS index.
DB_FAISS_PATH: 'vectorstore/db_faiss'
MODEL_TYPE: 'mistral'
# Forward slash keeps this path valid on both Windows and POSIX systems; a
# backslash inside a single-quoted YAML scalar is a literal character and
# would only resolve on Windows.
MODEL_BIN_PATH: 'models/mistral-7b-instruct-v0.2.Q4_0.gguf'
# Embedding model name handed to HuggingFaceEmbeddings in ingest.py.
EMBEDDINGS: 'sentence-transformers/all-mpnet-base-v2'
MAX_NEW_TOKENS: 256
TEMPERATURE: 0.0
ingest.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import box
2
+ import yaml
3
+ from langchain.vectorstores import FAISS
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader
6
+ from langchain.embeddings import HuggingFaceEmbeddings
7
+
8
+
9
# Read config.yml once at import time and expose its keys as attributes
# (cfg.DATA_PATH, cfg.CHUNK_SIZE, ...).
with open('config.yml', 'r', encoding='utf8') as config_file:
    raw_settings = yaml.safe_load(config_file)
cfg = box.Box(raw_settings)
12
+
13
+
14
def run_ingest() -> None:
    """Build a FAISS vector store from the PDFs in ``cfg.DATA_PATH`` and save it.

    Every ``*.pdf`` file matched in the data directory is loaded with
    ``PyPDFLoader``, split into chunks sized by ``cfg.CHUNK_SIZE`` with
    ``cfg.CHUNK_OVERLAP`` overlap, embedded on the CPU with the model named
    by ``cfg.EMBEDDINGS``, and the resulting index is written to
    ``cfg.DB_FAISS_PATH``.

    Raises:
        ValueError: If no text chunks could be produced (no PDFs matched, or
            the PDFs yielded no text), since an index cannot be built from
            an empty input.
    """
    loader = DirectoryLoader(
        cfg.DATA_PATH,
        glob='*.pdf',
        loader_cls=PyPDFLoader,
    )
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=cfg.CHUNK_SIZE,
        chunk_overlap=cfg.CHUNK_OVERLAP,
    )
    texts = text_splitter.split_documents(documents)

    # Fail early with an actionable message instead of letting the vector
    # store constructor fail on an empty list of chunks.
    if not texts:
        raise ValueError(
            f"No text chunks were produced from '*.pdf' files in "
            f"{cfg.DATA_PATH!r}; nothing to ingest."
        )

    embeddings = HuggingFaceEmbeddings(
        model_name=cfg.EMBEDDINGS,
        model_kwargs={'device': 'cpu'},
    )

    vectorstore = FAISS.from_documents(texts, embeddings)
    vectorstore.save_local(cfg.DB_FAISS_PATH)
29
+
30
# Script entry point: run the ingestion only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    run_ingest()
l.ipynb ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "[WinError 2] The system cannot find the file specified: 'llm-mistral-invoice-cpu'\n",
13
+ "c:\\Users\\Asus\\Desktop\\LLM\\llm-mistral-invoice-cpu - Copy\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "%cd llm-mistral-invoice-cpu"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 2,
24
+ "metadata": {},
25
+ "outputs": [
26
+ {
27
+ "name": "stderr",
28
+ "output_type": "stream",
29
+ "text": [
30
+ "DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\n"
31
+ ]
32
+ }
33
+ ],
34
+ "source": [
35
+ "!pip install -r requirements.txt -q"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 3,
41
+ "metadata": {},
42
+ "outputs": [
43
+ {
44
+ "data": {
45
+ "text/plain": [
46
+ "['Students Encyclopedia of General Knowledge- Ahmad. K.A..pdf']"
47
+ ]
48
+ },
49
+ "execution_count": 3,
50
+ "metadata": {},
51
+ "output_type": "execute_result"
52
+ }
53
+ ],
54
+ "source": [
55
+ "import os\n",
56
+ "os.listdir('data')"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 4,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "!python ingest.py"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 5,
71
+ "metadata": {},
72
+ "outputs": [
73
+ {
74
+ "name": "stdout",
75
+ "output_type": "stream",
76
+ "text": [
77
+ "\n",
78
+ "Answer: Magnetism is a physical phenomenon arising from the motion of electric charges. It causes materials to be attracted or repelled to each other. It is closely related to electricity, and together they are part of the electromagnetic force, which is one of the four fundamental forces in nature.\n",
79
+ "==================================================\n"
80
+ ]
81
+ }
82
+ ],
83
+ "source": [
84
+ "!python main.py \"what is magnetism?\""
85
+ ]
86
+ }
87
+ ],
88
+ "metadata": {
89
+ "kernelspec": {
90
+ "display_name": "Python 3",
91
+ "language": "python",
92
+ "name": "python3"
93
+ },
94
+ "language_info": {
95
+ "codemirror_mode": {
96
+ "name": "ipython",
97
+ "version": 3
98
+ },
99
+ "file_extension": ".py",
100
+ "mimetype": "text/x-python",
101
+ "name": "python",
102
+ "nbconvert_exporter": "python",
103
+ "pygments_lexer": "ipython3",
104
+ "version": "3.10.11"
105
+ }
106
+ },
107
+ "nbformat": 4,
108
+ "nbformat_minor": 2
109
+ }