hallu11 committed on
Commit
2aaccd7
·
verified ·
1 Parent(s): 429bc8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -71
app.py CHANGED
@@ -1,65 +1,62 @@
1
  import os
2
- import gradio as gr
3
  import fitz # PyMuPDF
 
 
4
  from langchain_core.documents import Document
5
- from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import FAISS
7
  from langchain_community.vectorstores.utils import DistanceStrategy
8
  from langchain_groq import ChatGroq
9
  from langchain.chains import RetrievalQA
10
  from langchain.prompts import PromptTemplate
11
 
12
- API_KEY = os.getenv("GROQ_API_KEY")
13
- if not API_KEY:
14
- raise ValueError("GROQ_API_KEY 환경변수가 설정되어 있지 않습니다.")
15
 
16
- from groq import Groq
17
- client = Groq(api_key=API_KEY)
18
 
19
- def extract_texts_from_pdfs(files):
 
20
  pdf_texts = []
21
- for file in files:
22
- with fitz.open(file.name) as doc:
23
- text = "".join(page.get_text() for page in doc)
24
- pdf_texts.append({"filename": file.name, "text": text})
25
- return pdf_texts
26
-
27
- def create_langchain_docs(pdf_texts):
28
- return [
29
  Document(page_content=doc["text"], metadata={"source": doc["filename"]})
30
  for doc in pdf_texts
31
  ]
32
 
 
33
  embedding_model = HuggingFaceEmbeddings(
34
  model_name="jhgan/ko-sbert-nli",
35
  model_kwargs={"device": "cpu"},
36
  encode_kwargs={"normalize_embeddings": True}
37
  )
38
 
 
39
  def filter_documents_by_keyword(docs, keyword):
40
  keyword_lower = keyword.lower()
41
  return [doc for doc in docs if keyword_lower in doc.page_content.lower()]
42
 
 
43
  def build_qa_chain(filtered_docs):
44
  if not filtered_docs:
45
  return None
46
 
47
- local_vs = FAISS.from_documents(
48
  documents=filtered_docs,
49
  embedding=embedding_model,
50
  distance_strategy=DistanceStrategy.COSINE
51
  )
52
-
53
- retriever = local_vs.as_retriever(
54
- search_type="mmr",
55
- search_kwargs={"k": 5, "lambda_mult": 0.2}
56
- )
57
-
58
  llm = ChatGroq(model_name="llama3-8b-8192", temperature=0.1)
59
 
60
  prompt = PromptTemplate(
61
  input_variables=["context", "question"],
62
- template=\"\"\"
63
  당신은 문화 프로그램에 대해 친절하고 정확하게 설명하는 한국어 도우미입니다.
64
 
65
  문서 내용:
@@ -68,9 +65,9 @@ def build_qa_chain(filtered_docs):
68
  질문: {question}
69
 
70
  지침:
71
- - 반드시 한국어로 답변해주세요
72
- - 문서에 없으면 "죄송하지만 해당 정보는 찾을 수 없습니다"라고 답변하세요
73
- \"\"\"
74
  )
75
 
76
  return RetrievalQA.from_chain_type(
@@ -81,63 +78,53 @@ def build_qa_chain(filtered_docs):
81
  return_source_documents=False
82
  )
83
 
84
- langchain_docs = []
85
- current_keyword = None
86
- current_qa_chain = None
 
87
 
88
- def chatbot_respond(question, files, keyword, chat_history):
89
- global langchain_docs, current_keyword, current_qa_chain
90
 
91
- if files and not langchain_docs:
92
- pdf_texts = extract_texts_from_pdfs(files)
93
- langchain_docs = create_langchain_docs(pdf_texts)
94
 
95
- keyword = keyword.strip()
96
- if not keyword:
97
- return "", chat_history + [("⚠️ 키워드를 입력해주세요.", "")]
98
 
99
  if keyword != current_keyword:
100
- filtered_docs = filter_documents_by_keyword(langchain_docs, keyword)
101
- current_qa_chain = build_qa_chain(filtered_docs)
102
  current_keyword = keyword
103
 
104
- if current_qa_chain is None:
105
- return "", chat_history + [(f"'{keyword}' 관련 문서를 찾을 수 없습니다.", "")]
106
 
107
- chat_history.append((question, "답변 생성 중..."))
108
- try:
109
- result = current_qa_chain({"query": question})
110
- answer = result["result"]
111
- except Exception as e:
112
- answer = f"⚠️ 오류 발생: {e}"
113
- chat_history[-1] = (question, answer)
114
  return "", chat_history
115
 
116
- def clear_chat():
117
- return []
 
 
118
 
119
- with gr.Blocks(title="문화 프로그램 Q&A 챗봇") as demo:
120
- gr.Markdown("## 📚 문화 프로그램 문서 기반 챗봇\n\n- PDF 파일을 업로드하고\n- 키워드를 입력 후\n- 질문을 하세요.")
121
 
122
- chatbot = gr.Chatbot(label="💬 챗봇 응답창", height=400)
123
- file_upload = gr.File(file_types=[".pdf"], file_count="multiple", label="📎 PDF 업로드")
124
- keyword_input = gr.Textbox(label="키워드 입력", placeholder="예: 발달장애인육사업", lines=1)
125
- user_input = gr.Textbox(label="질문 입력", placeholder="질문을 입력하세요...", lines=1)
126
- send_btn = gr.Button("질문 보내기")
127
- clear_btn = gr.Button("대화 초기화")
128
 
129
- chat_history = gr.State([])
 
 
130
 
131
- send_btn.click(
132
- fn=chatbot_respond,
133
- inputs=[user_input, file_upload, keyword_input, chat_history],
134
- outputs=[user_input, chatbot, chat_history]
135
- )
136
- user_input.submit(
137
- fn=chatbot_respond,
138
- inputs=[user_input, file_upload, keyword_input, chat_history],
139
- outputs=[user_input, chatbot, chat_history]
140
- )
141
- clear_btn.click(fn=clear_chat, outputs=chatbot)
142
 
143
  demo.launch()
 
1
  import os
 
2
  import fitz # PyMuPDF
3
+ import gradio as gr
4
+ from groq import Groq
5
  from langchain_core.documents import Document
6
+ from langchain_huggingface import HuggingFaceEmbeddings
7
  from langchain_community.vectorstores import FAISS
8
  from langchain_community.vectorstores.utils import DistanceStrategy
9
  from langchain_groq import ChatGroq
10
  from langchain.chains import RetrievalQA
11
  from langchain.prompts import PromptTemplate
12
 
13
+ # GROQ API KEY 환경변수에서 불러오기
14
+ client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 
15
 
16
+ # PDF 파싱 및 문서화
17
+ all_documents = []
18
 
19
+ def load_and_extract(file_path):
20
+ global all_documents
21
  pdf_texts = []
22
+
23
+ with fitz.open(file_path) as doc:
24
+ text = "".join(page.get_text() for page in doc)
25
+ pdf_texts.append({"filename": os.path.basename(file_path), "text": text})
26
+
27
+ all_documents = [
 
 
28
  Document(page_content=doc["text"], metadata={"source": doc["filename"]})
29
  for doc in pdf_texts
30
  ]
31
 
32
+ # ✅ 임베딩 모델
33
  embedding_model = HuggingFaceEmbeddings(
34
  model_name="jhgan/ko-sbert-nli",
35
  model_kwargs={"device": "cpu"},
36
  encode_kwargs={"normalize_embeddings": True}
37
  )
38
 
39
+ # ✅ 키워드 필터링
40
  def filter_documents_by_keyword(docs, keyword):
41
  keyword_lower = keyword.lower()
42
  return [doc for doc in docs if keyword_lower in doc.page_content.lower()]
43
 
44
+ # ✅ QA 체인 생성
45
  def build_qa_chain(filtered_docs):
46
  if not filtered_docs:
47
  return None
48
 
49
+ vectorstore = FAISS.from_documents(
50
  documents=filtered_docs,
51
  embedding=embedding_model,
52
  distance_strategy=DistanceStrategy.COSINE
53
  )
54
+ retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 5, "lambda_mult": 0.2})
 
 
 
 
 
55
  llm = ChatGroq(model_name="llama3-8b-8192", temperature=0.1)
56
 
57
  prompt = PromptTemplate(
58
  input_variables=["context", "question"],
59
+ template="""
60
  당신은 문화 프로그램에 대해 친절하고 정확하게 설명하는 한국어 도우미입니다.
61
 
62
  문서 내용:
 
65
  질문: {question}
66
 
67
  지침:
68
+ - 반드시 한국어로 답변해주세요.
69
+ - 문서에 없으면 "죄송하지만 해당 정보는 찾을 수 없습니다"라고 답변하세요.
70
+ """
71
  )
72
 
73
  return RetrievalQA.from_chain_type(
 
78
  return_source_documents=False
79
  )
80
 
81
+ # 챗봇 인터페이스
82
+ chat_history = []
83
+ current_chain = None
84
+ current_keyword = ""
85
 
86
+ def handle_chat(message, keyword):
87
+ global current_chain, current_keyword
88
 
89
+ if not all_documents:
90
+ return "", [("❗ PDF 파일을 먼저 업로드해주세요.", "")]
 
91
 
92
+ if not keyword.strip():
93
+ return "", [("❗ 키워드를 입력해주세요.", "")]
 
94
 
95
  if keyword != current_keyword:
96
+ filtered = filter_documents_by_keyword(all_documents, keyword)
97
+ current_chain = build_qa_chain(filtered)
98
  current_keyword = keyword
99
 
100
+ if not current_chain:
101
+ return "", [(f"'{keyword}' 관련 문서를 찾을 수 없습니다.", "")]
102
 
103
+ response = current_chain({"query": message})
104
+ answer = response["result"]
105
+ chat_history.append((f"🙋‍♂️ {message}", f"🤖 {answer}"))
 
 
 
 
106
  return "", chat_history
107
 
108
+ def clear_history():
109
+ global chat_history
110
+ chat_history = []
111
+ return chat_history
112
 
113
+ with gr.Blocks(title="오아시스 챗봇 Musesis") as demo:
114
+ gr.Markdown("### 📚 오아시스 PDF 기반 문화 Q&A 챗봇 (Musesis)")
115
 
116
+ file_upload = gr.File(label="📎 PDF 업로드", file_types=[".pdf"], type="filepath")
117
+ chatbot = gr.Chatbot(label="대화", height=400)
118
+ keyword_input = gr.Textbox(label="🔍 키워드", placeholder="예: 단오축제, 문화학교")
119
+ question_input = gr.Textbox(label="✉️ 질문", placeholder="질문을 입력하세요", lines=2)
 
 
120
 
121
+ with gr.Row():
122
+ submit_btn = gr.Button("질문하기 💬")
123
+ clear_btn = gr.Button("대화 초기화 🧹")
124
 
125
+ file_upload.change(fn=load_and_extract, inputs=file_upload)
126
+ submit_btn.click(fn=handle_chat, inputs=[question_input, keyword_input], outputs=[question_input, chatbot])
127
+ question_input.submit(fn=handle_chat, inputs=[question_input, keyword_input], outputs=[question_input, chatbot])
128
+ clear_btn.click(fn=clear_history, outputs=chatbot)
 
 
 
 
 
 
 
129
 
130
  demo.launch()