hyo37009 committed on
Commit
34c2cc1
·
1 Parent(s): 9973579
Files changed (5) hide show
  1. .idea/misc.xml +4 -0
  2. aa.py +19 -0
  3. app.py +7 -10
  4. code.py +177 -0
  5. sample.pdf +198 -0
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (talkWithME)" project-jdk-type="Python SDK" />
4
+ </project>
aa.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Scratch script: report which loader category each input file falls into.
# NOTE(review): the branches mirror the MIME-type dispatch used for uploaded
# files in the app; here the type is guessed from the file name instead,
# because a plain path has no ``.type`` attribute -- confirm this is the intent.
import mimetypes

# Paths of the files to classify.
docs = ["sample.pdf"]
doc_list = []


for file in docs:
    # ``guess_type`` returns (type, encoding); type is None when unknown.
    file_type, _ = mimetypes.guess_type(file)
    print('file - type : ', file_type)
    if file_type == 'text/plain':
        # file is .txt
        print("txt")
    elif file_type in ['application/octet-stream', 'application/pdf']:
        # file is .pdf
        print("pdf")
    elif file_type == 'text/csv':
        # file is .csv
        print("csv")
    elif file_type == 'application/json':
        # file is .json
        print("json")
app.py CHANGED
@@ -37,9 +37,9 @@ def get_text_file(docs):
37
  temp_dir = tempfile.TemporaryDirectory() # ์ž„์‹œ ๋””๋ ‰ํ† ๋ฆฌ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
38
  temp_filepath = os.path.join(temp_dir.name, docs.name) # ์ž„์‹œ ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
39
  with open(temp_filepath, "wb") as f: # ์ž„์‹œ ํŒŒ์ผ์„ ๋ฐ”์ด๋„ˆ๋ฆฌ ์“ฐ๊ธฐ ๋ชจ๋“œ๋กœ ์—ฝ๋‹ˆ๋‹ค.
40
- f.write(docs.getvalue()) # PDF ๋ฌธ์„œ์˜ ๋‚ด์šฉ์„ ์ž„์‹œ ํŒŒ์ผ์— ์”๋‹ˆ๋‹ค.
41
- txt_loader = TextLoader(temp_filepath) # PyPDFLoader๋ฅผ ์‚ฌ์šฉํ•ด PDF๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.
42
- txt_doc = txt_loader.load() # ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.
43
  return txt_doc # ์ถ”์ถœํ•œ ํ…์ŠคํŠธ๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
44
 
45
 
@@ -47,8 +47,8 @@ def get_csv_file(docs):
47
  temp_dir = tempfile.TemporaryDirectory() # ์ž„์‹œ ๋””๋ ‰ํ† ๋ฆฌ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
48
  temp_filepath = os.path.join(temp_dir.name, docs.name) # ์ž„์‹œ ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
49
  with open(temp_filepath, "wb") as f: # ์ž„์‹œ ํŒŒ์ผ์„ ๋ฐ”์ด๋„ˆ๋ฆฌ ์“ฐ๊ธฐ ๋ชจ๋“œ๋กœ ์—ฝ๋‹ˆ๋‹ค.
50
- f.write(docs.getvalue()) # PDF ๋ฌธ์„œ์˜ ๋‚ด์šฉ์„ ์ž„์‹œ ํŒŒ์ผ์— ์”๋‹ˆ๋‹ค.
51
- csv_loader = CSVLoader(temp_filepath) # PyPDFLoader๋ฅผ ์‚ฌ์šฉํ•ด PDF๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.
52
  csv_doc = csv_loader.load() # ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.
53
  return csv_doc # ์ถ”์ถœํ•œ ํ…์ŠคํŠธ๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
54
 
@@ -57,12 +57,9 @@ def get_json_file(docs):
57
  temp_dir = tempfile.TemporaryDirectory() # ์ž„์‹œ ๋””๋ ‰ํ† ๋ฆฌ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
58
  temp_filepath = os.path.join(temp_dir.name, docs.name) # ์ž„์‹œ ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
59
  with open(temp_filepath, "wb") as f: # ์ž„์‹œ ํŒŒ์ผ์„ ๋ฐ”์ด๋„ˆ๋ฆฌ ์“ฐ๊ธฐ ๋ชจ๋“œ๋กœ ์—ฝ๋‹ˆ๋‹ค.
60
- f.write(docs.getvalue()) # PDF ๋ฌธ์„œ์˜ ๋‚ด์šฉ์„ ์ž„์‹œ ํŒŒ์ผ์— ์”๋‹ˆ๋‹ค.
61
- loader = JSONLoader(file_path=temp_filepath, jq_schema='.messages[].content', text_content=False)
62
- # jq_schema='.messages[].content'
63
- # data = json.loads(Path(temp_filepath).read_text())
64
  data = loader.load()
65
- # raise Exception(data)
66
  return data # ์ถ”์ถœํ•œ ํ…์ŠคํŠธ๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
67
 
68
 
 
37
  temp_dir = tempfile.TemporaryDirectory() # ์ž„์‹œ ๋””๋ ‰ํ† ๋ฆฌ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
38
  temp_filepath = os.path.join(temp_dir.name, docs.name) # ์ž„์‹œ ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
39
  with open(temp_filepath, "wb") as f: # ์ž„์‹œ ํŒŒ์ผ์„ ๋ฐ”์ด๋„ˆ๋ฆฌ ์“ฐ๊ธฐ ๋ชจ๋“œ๋กœ ์—ฝ๋‹ˆ๋‹ค.
40
+ f.write(docs.getvalue()) # txt ๋ฌธ์„œ์˜ ๋‚ด์šฉ์„ ์ž„์‹œ ํŒŒ์ผ์— ์”๋‹ˆ๋‹ค.
41
+ txt_loader = TextLoader(temp_filepath) # TextLoader๋ฅผ ์‚ฌ์šฉํ•ด txt๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.
42
+ txt_doc = txt_loader.load() # ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.
43
  return txt_doc # ์ถ”์ถœํ•œ ํ…์ŠคํŠธ๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
44
 
45
 
 
47
  temp_dir = tempfile.TemporaryDirectory() # ์ž„์‹œ ๋””๋ ‰ํ† ๋ฆฌ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
48
  temp_filepath = os.path.join(temp_dir.name, docs.name) # ์ž„์‹œ ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
49
  with open(temp_filepath, "wb") as f: # ์ž„์‹œ ํŒŒ์ผ์„ ๋ฐ”์ด๋„ˆ๋ฆฌ ์“ฐ๊ธฐ ๋ชจ๋“œ๋กœ ์—ฝ๋‹ˆ๋‹ค.
50
+ f.write(docs.getvalue()) # csv ๋ฌธ์„œ์˜ ๋‚ด์šฉ์„ ์ž„์‹œ ํŒŒ์ผ์— ์”๋‹ˆ๋‹ค.
51
+ csv_loader = CSVLoader(temp_filepath) # CSVLoader๋ฅผ ์‚ฌ์šฉํ•ด CSV๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.
52
  csv_doc = csv_loader.load() # ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.
53
  return csv_doc # ์ถ”์ถœํ•œ ํ…์ŠคํŠธ๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
54
 
 
57
  temp_dir = tempfile.TemporaryDirectory() # ์ž„์‹œ ๋””๋ ‰ํ† ๋ฆฌ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
58
  temp_filepath = os.path.join(temp_dir.name, docs.name) # ์ž„์‹œ ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
59
  with open(temp_filepath, "wb") as f: # ์ž„์‹œ ํŒŒ์ผ์„ ๋ฐ”์ด๋„ˆ๋ฆฌ ์“ฐ๊ธฐ ๋ชจ๋“œ๋กœ ์—ฝ๋‹ˆ๋‹ค.
60
+ f.write(docs.getvalue()) # JSON ๋ฌธ์„œ์˜ ๋‚ด์šฉ์„ ์ž„์‹œ ํŒŒ์ผ์— ์”๋‹ˆ๋‹ค.
61
+ loader = JSONLoader(file_path=temp_filepath, jq_schema='.messages[].content', text_content=False) # JSONLoader๋ฅผ ์‚ฌ์šฉํ•ด JSON๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.
 
 
62
  data = loader.load()
 
63
  return data # ์ถ”์ถœํ•œ ํ…์ŠคํŠธ๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
64
 
65
 
code.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
+ from langchain.vectorstores import FAISS, Chroma
7
+ from langchain.embeddings import HuggingFaceEmbeddings # General embeddings from HuggingFace models.
8
+ from langchain.chat_models import ChatOpenAI
9
+ from langchain.memory import ConversationBufferMemory
10
+ from langchain.chains import ConversationalRetrievalChain
11
+ from htmlTemplates import css, bot_template, user_template
12
+ from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers # For loading transformer models.
13
+ from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
14
+ import tempfile # ์ž„์‹œ ํŒŒ์ผ์„ ์ƒ์„ฑํ•˜๊ธฐ ์œ„ํ•œ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ์ž…๋‹ˆ๋‹ค.
15
+ import os
16
+ import json
17
+ from pathlib import Path
18
+ from pprint import pprint
19
+
20
+
21
+ # Extracts the text from a PDF document.
22
def get_pdf_text(pdf_docs):
    """Load an uploaded PDF with ``PyPDFLoader``.

    Args:
        pdf_docs: Uploaded-file object exposing ``name`` and ``getvalue()``.

    Returns:
        The result of ``PyPDFLoader(path).load()`` for the uploaded file.
    """
    # The loader takes a file path, so the upload is first written to a
    # temporary directory that is removed as soon as loading has finished.
    with tempfile.TemporaryDirectory() as temp_dir:
        # basename() keeps the file inside temp_dir even if the client-supplied
        # name contains directory components.
        temp_filepath = os.path.join(temp_dir, os.path.basename(pdf_docs.name))
        with open(temp_filepath, "wb") as f:
            f.write(pdf_docs.getvalue())
        # load() must run inside the ``with`` block: the file is gone afterwards.
        return PyPDFLoader(temp_filepath).load()
31
+
32
+
33
+ # Assignment
34
+ # Write the text-extraction functions below.
35
+
36
def get_text_file(docs):
    """Load an uploaded plain-text file with ``TextLoader``.

    Args:
        docs: Uploaded-file object exposing ``name`` and ``getvalue()``.

    Returns:
        The result of ``TextLoader(path).load()`` for the uploaded file.
    """
    # The loader takes a file path, so the upload is first written to a
    # temporary directory that is removed as soon as loading has finished.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Use the parameter ``docs`` (the previous code referenced an undefined
        # ``pdf_docs``); basename() keeps the file inside temp_dir.
        temp_filepath = os.path.join(temp_dir, os.path.basename(docs.name))
        with open(temp_filepath, "wb") as f:
            f.write(docs.getvalue())
        # load() must run inside the ``with`` block: the file is gone afterwards.
        return TextLoader(temp_filepath).load()
44
+
45
+
46
def get_csv_file(docs):
    """Load an uploaded CSV file with ``CSVLoader``.

    Args:
        docs: Uploaded-file object exposing ``name`` and ``getvalue()``.

    Returns:
        The result of ``CSVLoader(path).load()`` for the uploaded file.
    """
    # The loader takes a file path, so the upload is first written to a
    # temporary directory that is removed as soon as loading has finished.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Use the parameter ``docs`` (the previous code referenced an undefined
        # ``pdf_docs``); basename() keeps the file inside temp_dir.
        temp_filepath = os.path.join(temp_dir, os.path.basename(docs.name))
        with open(temp_filepath, "wb") as f:
            f.write(docs.getvalue())
        # load() must run inside the ``with`` block: the file is gone afterwards.
        return CSVLoader(temp_filepath).load()
54
+
55
+
56
def get_json_file(docs):
    """Parse an uploaded JSON file and return the decoded value.

    Args:
        docs: Uploaded-file object exposing ``getvalue()``, which yields the
            raw JSON payload.

    Returns:
        The Python value produced by ``json.loads`` (dict, list, ...).

    Raises:
        json.JSONDecodeError: If the payload is not valid JSON.
    """
    # json.loads accepts bytes and detects the UTF-8/16/32 encoding itself, so
    # no temporary file (and no locale-dependent text decoding) is needed.
    # NOTE(review): this returns raw parsed JSON, not loader documents; a caller
    # that extends a document list with the result should confirm that is what
    # it wants.
    return json.loads(docs.getvalue())
63
+
64
+
65
+ # Splits the loaded documents into text chunks.
66
def get_text_chunks(documents):
    """Split ``documents`` into overlapping chunks.

    Uses ``RecursiveCharacterTextSplitter`` with a chunk size of 1000, an
    overlap of 200, and ``len`` as the length function.

    Args:
        documents: Documents accepted by ``split_documents``.

    Returns:
        The chunks returned by ``split_documents``.
    """
    splitter_options = {
        "chunk_size": 1000,      # size of each chunk
        "chunk_overlap": 200,    # overlap between consecutive chunks
        "length_function": len,  # how chunk length is measured
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)
75
+
76
+
77
+ # Builds a vector store from the text chunks.
78
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from ``text_chunks``.

    Args:
        text_chunks: Document chunks to index.

    Returns:
        The store created by ``FAISS.from_documents`` using a default
        ``OpenAIEmbeddings`` instance.
    """
    # The embedding model is constructed with its defaults.
    return FAISS.from_documents(text_chunks, OpenAIEmbeddings())
85
+
86
+
87
def get_conversation_chain(vectorstore):
    """Create the conversational retrieval chain over ``vectorstore``.

    Args:
        vectorstore: Object providing ``as_retriever()``.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm`` from a
        ``gpt-3.5-turbo`` chat model, a buffer memory stored under the
        ``chat_history`` key, and the store's retriever.
    """
    chain_parts = dict(
        # Chat model that answers the questions.
        llm=ChatOpenAI(model_name='gpt-3.5-turbo'),
        # Memory holding the conversation so far, returned as message objects.
        memory=ConversationBufferMemory(
            memory_key='chat_history', return_messages=True),
        retriever=vectorstore.as_retriever(),
    )
    return ConversationalRetrievalChain.from_llm(**chain_parts)
101
+
102
+
103
+ # Handles the user's input.
104
def handle_userinput(user_question):
    """Answer ``user_question`` with the stored chain and render the chat.

    The chain is read from ``st.session_state.conversation``; its
    ``chat_history`` result is stored back in the session state and every
    message is written out, alternating between the user and bot templates.

    Args:
        user_question: The question text entered by the user.
    """
    # Local import keeps this block self-contained.
    import html

    conversation = st.session_state.conversation
    if conversation is None:
        # No chain exists until documents have been processed; calling None
        # would raise a TypeError.
        st.warning(
            "Please upload your documents and click 'Process' before asking a question.")
        return

    # Generate the reply and keep the updated conversation history.
    response = conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state['chat_history']):
        # Even positions are rendered with the user template, odd ones with
        # the bot template.
        template = user_template if i % 2 == 0 else bot_template
        # The templates are rendered as raw HTML, so the message text is
        # escaped to stop markup in a question or reply from being injected.
        st.write(
            template.replace("{{MSG}}", html.escape(message.content)),
            unsafe_allow_html=True)
117
+
118
+
119
def main():
    """Run the Streamlit app: chat box, API-key input and document upload.

    Uploaded files are dispatched on their MIME type to the matching loader,
    split into chunks, indexed in a vector store, and the resulting
    conversation chain is stored in ``st.session_state.conversation``.
    """
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple Files",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Initialise each key independently so an existing value is never reset.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple Files :")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        # Mask the key so it is not shown in clear text on screen.
        openai_key = st.text_input(
            "Paste your OpenAI API key (sk-...)", type="password")
        if openai_key:
            os.environ["OPENAI_API_KEY"] = openai_key

        st.subheader("Your documents")
        docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            if not docs:
                # Nothing to index; building a vector store from no documents fails.
                st.warning("Upload at least one file before clicking 'Process'.")
                return

            # MIME type -> loader function.
            loaders = {
                'text/plain': get_text_file,                # .txt
                'application/octet-stream': get_pdf_text,   # .pdf
                'application/pdf': get_pdf_text,            # .pdf
                'text/csv': get_csv_file,                   # .csv
                'application/json': get_json_file,          # .json
            }

            with st.spinner("Processing"):
                doc_list = []

                for file in docs:
                    print('file - type : ', file.type)
                    loader = loaders.get(file.type)
                    if loader is None:
                        # Tell the user instead of silently dropping the file.
                        st.warning(
                            f"Skipping unsupported file type: {file.name} ({file.type})")
                        continue
                    doc_list.extend(loader(file))

                if not doc_list:
                    st.warning("No content could be extracted from the uploaded files.")
                    return

                # get the text chunks
                text_chunks = get_text_chunks(doc_list)

                # create vector store
                vectorstore = get_vectorstore(text_chunks)

                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore)
174
+
175
+
176
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()
sample.pdf ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ %PDF-1.3
2
+ %๏ฟฝ๏ฟฝ๏ฟฝ๏ฟฝ
3
+
4
+ 1 0 obj
5
+ <<
6
+ /Type /Catalog
7
+ /Outlines 2 0 R
8
+ /Pages 3 0 R
9
+ >>
10
+ endobj
11
+
12
+ 2 0 obj
13
+ <<
14
+ /Type /Outlines
15
+ /Count 0
16
+ >>
17
+ endobj
18
+
19
+ 3 0 obj
20
+ <<
21
+ /Type /Pages
22
+ /Count 2
23
+ /Kids [ 4 0 R 6 0 R ]
24
+ >>
25
+ endobj
26
+
27
+ 4 0 obj
28
+ <<
29
+ /Type /Page
30
+ /Parent 3 0 R
31
+ /Resources <<
32
+ /Font <<
33
+ /F1 9 0 R
34
+ >>
35
+ /ProcSet 8 0 R
36
+ >>
37
+ /MediaBox [0 0 612.0000 792.0000]
38
+ /Contents 5 0 R
39
+ >>
40
+ endobj
41
+
42
+ 5 0 obj
43
+ << /Length 1074 >>
44
+ stream
45
+ 2 J
46
+ BT
47
+ 0 0 0 rg
48
+ /F1 0027 Tf
49
+ 57.3750 722.2800 Td
50
+ ( A Simple PDF File ) Tj
51
+ ET
52
+ BT
53
+ /F1 0010 Tf
54
+ 69.2500 688.6080 Td
55
+ ( This is a small demonstration .pdf file - ) Tj
56
+ ET
57
+ BT
58
+ /F1 0010 Tf
59
+ 69.2500 664.7040 Td
60
+ ( just for use in the Virtual Mechanics tutorials. More text. And more ) Tj
61
+ ET
62
+ BT
63
+ /F1 0010 Tf
64
+ 69.2500 652.7520 Td
65
+ ( text. And more text. And more text. And more text. ) Tj
66
+ ET
67
+ BT
68
+ /F1 0010 Tf
69
+ 69.2500 628.8480 Td
70
+ ( And more text. And more text. And more text. And more text. And more ) Tj
71
+ ET
72
+ BT
73
+ /F1 0010 Tf
74
+ 69.2500 616.8960 Td
75
+ ( text. And more text. Boring, zzzzz. And more text. And more text. And ) Tj
76
+ ET
77
+ BT
78
+ /F1 0010 Tf
79
+ 69.2500 604.9440 Td
80
+ ( more text. And more text. And more text. And more text. And more text. ) Tj
81
+ ET
82
+ BT
83
+ /F1 0010 Tf
84
+ 69.2500 592.9920 Td
85
+ ( And more text. And more text. ) Tj
86
+ ET
87
+ BT
88
+ /F1 0010 Tf
89
+ 69.2500 569.0880 Td
90
+ ( And more text. And more text. And more text. And more text. And more ) Tj
91
+ ET
92
+ BT
93
+ /F1 0010 Tf
94
+ 69.2500 557.1360 Td
95
+ ( text. And more text. And more text. Even more. Continued on page 2 ...) Tj
96
+ ET
97
+ endstream
98
+ endobj
99
+
100
+ 6 0 obj
101
+ <<
102
+ /Type /Page
103
+ /Parent 3 0 R
104
+ /Resources <<
105
+ /Font <<
106
+ /F1 9 0 R
107
+ >>
108
+ /ProcSet 8 0 R
109
+ >>
110
+ /MediaBox [0 0 612.0000 792.0000]
111
+ /Contents 7 0 R
112
+ >>
113
+ endobj
114
+
115
+ 7 0 obj
116
+ << /Length 676 >>
117
+ stream
118
+ 2 J
119
+ BT
120
+ 0 0 0 rg
121
+ /F1 0027 Tf
122
+ 57.3750 722.2800 Td
123
+ ( Simple PDF File 2 ) Tj
124
+ ET
125
+ BT
126
+ /F1 0010 Tf
127
+ 69.2500 688.6080 Td
128
+ ( ...continued from page 1. Yet more text. And more text. And more text. ) Tj
129
+ ET
130
+ BT
131
+ /F1 0010 Tf
132
+ 69.2500 676.6560 Td
133
+ ( And more text. And more text. And more text. And more text. And more ) Tj
134
+ ET
135
+ BT
136
+ /F1 0010 Tf
137
+ 69.2500 664.7040 Td
138
+ ( text. Oh, how boring typing this stuff. But not as boring as watching ) Tj
139
+ ET
140
+ BT
141
+ /F1 0010 Tf
142
+ 69.2500 652.7520 Td
143
+ ( paint dry. And more text. And more text. And more text. And more text. ) Tj
144
+ ET
145
+ BT
146
+ /F1 0010 Tf
147
+ 69.2500 640.8000 Td
148
+ ( Boring. More, a little more text. The end, and just as well. ) Tj
149
+ ET
150
+ endstream
151
+ endobj
152
+
153
+ 8 0 obj
154
+ [/PDF /Text]
155
+ endobj
156
+
157
+ 9 0 obj
158
+ <<
159
+ /Type /Font
160
+ /Subtype /Type1
161
+ /Name /F1
162
+ /BaseFont /Helvetica
163
+ /Encoding /WinAnsiEncoding
164
+ >>
165
+ endobj
166
+
167
+ 10 0 obj
168
+ <<
169
+ /Creator (Rave \(http://www.nevrona.com/rave\))
170
+ /Producer (Nevrona Designs)
171
+ /CreationDate (D:20060301072826)
172
+ >>
173
+ endobj
174
+
175
+ xref
176
+ 0 11
177
+ 0000000000 65535 f
178
+ 0000000019 00000 n
179
+ 0000000093 00000 n
180
+ 0000000147 00000 n
181
+ 0000000222 00000 n
182
+ 0000000390 00000 n
183
+ 0000001522 00000 n
184
+ 0000001690 00000 n
185
+ 0000002423 00000 n
186
+ 0000002456 00000 n
187
+ 0000002574 00000 n
188
+
189
+ trailer
190
+ <<
191
+ /Size 11
192
+ /Root 1 0 R
193
+ /Info 10 0 R
194
+ >>
195
+
196
+ startxref
197
+ 2714
198
+ %%EOF