tigerteam anpigon committed on
Commit
d2b1f24
·
0 Parent(s):

Duplicate from anpigon/talktosayno

Browse files

Co-authored-by: anpigon <anpigon@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ *.pdf filter=lfs diff=lfs merge=lfs -text
36
+ db/**/* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
2
+ # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,macos,circuitpython,python,pythonvanilla
3
+ # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,macos,circuitpython,python,pythonvanilla
4
+
5
+ ### CircuitPython ###
6
+ .Trashes
7
+ .metadata_never_index
8
+ .fseventsd/
9
+ boot_out.txt
10
+
11
+ ### macOS ###
12
+ # General
13
+ .DS_Store
14
+ .AppleDouble
15
+ .LSOverride
16
+
17
+ # Icon must end with two \r
18
+ Icon
19
+
20
+
21
+ # Thumbnails
22
+ ._*
23
+
24
+ # Files that might appear in the root of a volume
25
+ .DocumentRevisions-V100
26
+ .fseventsd
27
+ .Spotlight-V100
28
+ .TemporaryItems
29
+ .VolumeIcon.icns
30
+ .com.apple.timemachine.donotpresent
31
+
32
+ # Directories potentially created on remote AFP share
33
+ .AppleDB
34
+ .AppleDesktop
35
+ Network Trash Folder
36
+ Temporary Items
37
+ .apdisk
38
+
39
+ ### macOS Patch ###
40
+ # iCloud generated files
41
+ *.icloud
42
+
43
+ ### Python ###
44
+ # Byte-compiled / optimized / DLL files
45
+ __pycache__/
46
+ *.py[cod]
47
+ *$py.class
48
+
49
+ # C extensions
50
+ *.so
51
+
52
+ # Distribution / packaging
53
+ .Python
54
+ build/
55
+ develop-eggs/
56
+ dist/
57
+ downloads/
58
+ eggs/
59
+ .eggs/
60
+ lib/
61
+ lib64/
62
+ parts/
63
+ sdist/
64
+ var/
65
+ wheels/
66
+ share/python-wheels/
67
+ *.egg-info/
68
+ .installed.cfg
69
+ *.egg
70
+ MANIFEST
71
+
72
+ # PyInstaller
73
+ # Usually these files are written by a python script from a template
74
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
75
+ *.manifest
76
+ *.spec
77
+
78
+ # Installer logs
79
+ pip-log.txt
80
+ pip-delete-this-directory.txt
81
+
82
+ # Unit test / coverage reports
83
+ htmlcov/
84
+ .tox/
85
+ .nox/
86
+ .coverage
87
+ .coverage.*
88
+ .cache
89
+ nosetests.xml
90
+ coverage.xml
91
+ *.cover
92
+ *.py,cover
93
+ .hypothesis/
94
+ .pytest_cache/
95
+ cover/
96
+
97
+ # Translations
98
+ *.mo
99
+ *.pot
100
+
101
+ # Django stuff:
102
+ *.log
103
+ local_settings.py
104
+ db.sqlite3
105
+ db.sqlite3-journal
106
+
107
+ # Flask stuff:
108
+ instance/
109
+ .webassets-cache
110
+
111
+ # Scrapy stuff:
112
+ .scrapy
113
+
114
+ # Sphinx documentation
115
+ docs/_build/
116
+
117
+ # PyBuilder
118
+ .pybuilder/
119
+ target/
120
+
121
+ # Jupyter Notebook
122
+ .ipynb_checkpoints
123
+
124
+ # IPython
125
+ profile_default/
126
+ ipython_config.py
127
+
128
+ # pyenv
129
+ # For a library or package, you might want to ignore these files since the code is
130
+ # intended to run in multiple environments; otherwise, check them in:
131
+ # .python-version
132
+
133
+ # pipenv
134
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
135
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
136
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
137
+ # install all needed dependencies.
138
+ #Pipfile.lock
139
+
140
+ # poetry
141
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
142
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
143
+ # commonly ignored for libraries.
144
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
145
+ #poetry.lock
146
+
147
+ # pdm
148
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
149
+ #pdm.lock
150
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
151
+ # in version control.
152
+ # https://pdm.fming.dev/#use-with-ide
153
+ .pdm.toml
154
+
155
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
156
+ __pypackages__/
157
+
158
+ # Celery stuff
159
+ celerybeat-schedule
160
+ celerybeat.pid
161
+
162
+ # SageMath parsed files
163
+ *.sage.py
164
+
165
+ # Environments
166
+ .env
167
+ .venv
168
+ env/
169
+ venv/
170
+ ENV/
171
+ env.bak/
172
+ venv.bak/
173
+
174
+ # Spyder project settings
175
+ .spyderproject
176
+ .spyproject
177
+
178
+ # Rope project settings
179
+ .ropeproject
180
+
181
+ # mkdocs documentation
182
+ /site
183
+
184
+ # mypy
185
+ .mypy_cache/
186
+ .dmypy.json
187
+ dmypy.json
188
+
189
+ # Pyre type checker
190
+ .pyre/
191
+
192
+ # pytype static type analyzer
193
+ .pytype/
194
+
195
+ # Cython debug symbols
196
+ cython_debug/
197
+
198
+ # PyCharm
199
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
200
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
201
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
202
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
203
+ #.idea/
204
+
205
+ ### Python Patch ###
206
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
207
+ poetry.toml
208
+
209
+ # ruff
210
+ .ruff_cache/
211
+
212
+ # LSP config files
213
+ pyrightconfig.json
214
+
215
+ ### PythonVanilla ###
216
+ # Byte-compiled / optimized / DLL files
217
+
218
+ # C extensions
219
+
220
+ # Distribution / packaging
221
+
222
+ # Installer logs
223
+
224
+ # Unit test / coverage reports
225
+
226
+ # Translations
227
+
228
+ # pyenv
229
+ # For a library or package, you might want to ignore these files since the code is
230
+ # intended to run in multiple environments; otherwise, check them in:
231
+ # .python-version
232
+
233
+ # pipenv
234
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
235
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
236
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
237
+ # install all needed dependencies.
238
+
239
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
240
+
241
+
242
+ ### VisualStudioCode ###
243
+ .vscode/*
244
+ !.vscode/settings.json
245
+ !.vscode/tasks.json
246
+ !.vscode/launch.json
247
+ !.vscode/extensions.json
248
+ !.vscode/*.code-snippets
249
+
250
+ # Local History for Visual Studio Code
251
+ .history/
252
+
253
+ # Built Visual Studio Code Extensions
254
+ *.vsix
255
+
256
+ ### VisualStudioCode Patch ###
257
+ # Ignore all local history of files
258
+ .history
259
+ .ionide
260
+
261
+ # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,macos,circuitpython,python,pythonvanilla
262
+
263
+ # Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)
264
+ *.ipynb
265
+ *.pdf
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Talktosayno
3
+ emoji: 📉
4
+ colorFrom: green
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 3.34.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: openrail
11
+ duplicated_from: anpigon/talktosayno
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain.chains import RetrievalQAWithSourcesChain
3
+ from langchain.chat_models import ChatOpenAI
4
+ from langchain.embeddings.openai import OpenAIEmbeddings
5
+ from langchain.prompts.chat import (
6
+ ChatPromptTemplate,
7
+ HumanMessagePromptTemplate,
8
+ SystemMessagePromptTemplate,
9
+ )
10
+ from langchain.vectorstores import Chroma
11
+
12
+ from constants import persist_directory
13
+
14
# Re-open the Chroma vector store that was persisted to disk and expose it as
# a retriever; k=3 is forwarded as the search keyword arguments.
embedding = OpenAIEmbeddings()
vector_store = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)
retriever = vector_store.as_retriever(search_kwargs=dict(k=3))

# Chat model that writes the final answer.
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# System prompt: {summaries} is the placeholder the chain fills with the
# retrieved context; the user's text arrives through {question} below.
system_template = """Your name is Sayno. You act like a successful rich man.
Never reveal that it is an AI. Use the following pieces of context to answer the users question shortly.
Given the following summaries of a long document and a question, create a final answer with references.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.
----------------
{summaries}

You MUST answer in Korean and in Markdown format:"""

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
prompt = ChatPromptTemplate.from_messages(messages)
chain_type_kwargs = dict(prompt=prompt)

# Question-answering chain over the retriever using the custom chat prompt.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
    reduce_k_below_max_tokens=True,
    verbose=True,
)
47
+
48
+
49
+ # Define the function that handles the chatbot's response.
50
def respond(message, chat_history):
    """Answer one user message and record the exchange in the chat history.

    Args:
        message: Text submitted from the input textbox.
        chat_history: List of ``(user_message, bot_message)`` pairs held by
            the chatbot component. ``None`` is treated as an empty history.

    Returns:
        A ``("", chat_history)`` tuple. The empty string is written back to
        the textbox and ``chat_history`` is the updated conversation.
    """
    # Defensive: start a fresh history rather than failing on ``.append``.
    if chat_history is None:
        chat_history = []

    # Blank input: skip the retrieval/LLM round trip and leave the
    # conversation untouched.
    if not message or not message.strip():
        return "", chat_history

    result = chain(message)
    bot_message = result["answer"]

    # Append the user's message together with the bot's reply.
    chat_history.append((message, bot_message))

    # Return the updated history; the empty string clears the textbox.
    return "", chat_history
60
+
61
+
62
+ # Build the interface using gr.Blocks().
63
with gr.Blocks(theme="gstaff/sketch") as demo:
    # Page heading: "Hello. Have a conversation with Sayno."
    gr.Markdown("# 안녕하세요. 세이노와 대화해보세요.")
    chatbot = gr.Chatbot(label="채팅창")  # chatbot component labeled "chat window"
    msg = gr.Textbox(label="입력")  # textbox labeled "input"
    clear = gr.Button("초기화")  # button labeled "reset"

    # Submitting the textbox calls respond with (msg, chatbot) and writes its
    # two return values back to the same two components.
    msg.submit(respond, [msg, chatbot], [msg, chatbot])

    # Clicking the reset button sets the chatbot value to None, which clears
    # the displayed chat history.
    clear.click(lambda: None, None, chatbot, queue=False)

# Start the interface. Users type into the input textbox and submit, and can
# clear the conversation with the reset button.
demo.launch(debug=True)
constants.py ADDED
@@ -0,0 +1 @@
 
 
1
# Directory of the persisted Chroma database; imported by app.py and ingest.py.
persist_directory = "db"
db/chroma-collections.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ec1d87bfee88f360377596a8b5b728432aa85920c7181631498fd197100b2e4
3
+ size 557
db/chroma-embeddings.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffd198f0e3337040952c6cb0f085b9420eadf9c97edfa0042a88b46303764fe5
3
+ size 12201813
db/index/id_to_uuid_048324df-8303-4945-a7e0-a5c0016baa78.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9400536ee02de027d46387ff22b5319c70b13adf4e1c709a17d32c20be14ff8
3
+ size 29880
db/index/index_048324df-8303-4945-a7e0-a5c0016baa78.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6bec59a94f6aaed2ba9fa6e914ebc5f46aba9c61ef7a1a88d839da739a5e350
3
+ size 5826524
db/index/index_metadata_048324df-8303-4945-a7e0-a5c0016baa78.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8253ccb8d665c6f573fdc33cbec32b62e7bd274df6fa5776018e72857b8e301
3
+ size 105
db/index/uuid_to_id_048324df-8303-4945-a7e0-a5c0016baa78.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a86bb2dab42ef52d3f369ab7a0842356c269d42e78c302086a8278a134725d8
3
+ size 34949
ingest.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.document_loaders import PyPDFDirectoryLoader
2
+ from langchain.embeddings.openai import OpenAIEmbeddings
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.vectorstores import Chroma
5
+
6
+ from constants import persist_directory
7
+
8
# Load the PDF documents found under docs/.
loader = PyPDFDirectoryLoader("docs/")
documents = loader.load()

# Split the documents into overlapping chunks. Separators are tried in the
# listed order and kept in the output.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", ",", " ", ""],
    keep_separator=True,
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks with OpenAI embeddings and store them in a Chroma
# collection located at persist_directory.
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embedding,
    documents=texts,
)

# Write the collection to disk.
vectordb.persist()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ openai==0.27.8
2
+ langchain==0.0.196
3
+ pypdf==3.9.1
4
+ chromadb==0.3.26
5
+ tiktoken==0.4.0
6
+ gradio==3.34.0