dgmos committed on
Commit
6527359
·
1 Parent(s): a979c0b

Update app.py and requirements.txt with OCR support

Browse files
Files changed (1) hide show
  1. app.py +18 -40
app.py CHANGED
@@ -1,6 +1,6 @@
1
 
2
  import os
3
- from langchain_community.document_loaders import UnstructuredPDFLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import FAISS
@@ -22,51 +22,29 @@ llm = HuggingFaceEndpoint(
22
  task="text-generation"
23
  )
24
 
25
- # 3. GitHub 저장소에서 PDF 자동 로딩 및 벡터 DB 생성
26
- vectorstore_path = "/app/data/chatbot_db/index.faiss"
27
 
28
- if not os.path.exists(vectorstore_path):
29
- print("📂 GitHub 저장소 PDF 자동 로딩 시작...")
30
-
31
- # PDF 폴더 경로 (Hugging Face Space에서의 경로)
32
- pdf_folder = "/app/data/manuals/"
33
-
34
- if not os.path.exists(pdf_folder):
35
- raise FileNotFoundError(f"{pdf_folder} 폴더가 존재하지 않습니다. 먼저 PDF 파일을 GitHub 저장소에 업로드하세요.")
36
-
37
- # 모든 PDF 파일 로드
38
- docs = []
39
- for filename in os.listdir(pdf_folder):
40
- if filename.endswith(".pdf"):
41
- file_path = os.path.join(pdf_folder, filename)
42
- loader = UnstructuredPDFLoader(file_path)
43
- docs.extend(loader.load())
44
-
45
- # 텍스트 분할
46
- splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
47
- texts = splitter.split_documents(docs)
48
-
49
- # 임베딩 및 벡터 DB 생성
50
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
51
- vectorstore = FAISS.from_documents(texts, embeddings)
52
-
53
- # 벡터 DB 저장 (다음 실행 시 재사용)
54
- vectorstore.save_local("/app/data/chatbot_db")
55
- print("✅ GitHub 저장소 PDF 자동 로딩 완료!")
56
- else:
57
- print("📂 저장된 벡터 DB 발견. 로딩 중...")
58
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
59
- vectorstore = FAISS.load_local("/app/data/chatbot_db", embeddings, allow_dangerous_deserialization=True)
60
- print("✅ 저장된 벡터 DB 로딩 완료!")
61
 
62
- # 4. RAG 체인 생성
 
 
 
 
 
 
63
  qa_chain = RetrievalQA.from_chain_type(
64
  llm=llm,
65
  chain_type="stuff",
66
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
67
  )
68
 
69
- # 5. 챗봇 함수 (파일 업로드 없이 질문만 받음)
70
  def chatbot(query):
71
  try:
72
  response = qa_chain.run(query)
@@ -74,10 +52,10 @@ def chatbot(query):
74
  except Exception as e:
75
  return f"오류: {str(e)}."
76
 
77
- # 6. Gradio UI
78
  with gr.Blocks(title="Ericsson 장비 분석 챗봇") as demo:
79
  gr.Markdown("# 🚀 3G/LTE/5G 장비 불량/불요파 분석 챗봇")
80
- gr.Markdown("GitHub 저장소에 미리 업로드한 PDF를 기반으로 질문만 입력하세요!")
81
  query = gr.Textbox(label="질문 (한국어/영어)", placeholder="Spurious Emission 원인은?")
82
  output = gr.Textbox(label="응답", lines=10)
83
  btn = gr.Button("분석 시작!")
 
1
 
2
  import os
3
+ from datasets import load_dataset
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import FAISS
 
22
  task="text-generation"
23
  )
24
 
25
+ # 3. Hugging Face Dataset 로드
26
+ dataset = load_dataset("dgmos/ericsson-manuals", split="train")
27
 
28
+ # 4. 텍스트 추출 및 벡터 DB 생성
29
+ docs = []
30
+ for item in dataset:
31
+ text = item["text"]
32
+ docs.append(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
+ splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
35
+ texts = splitter.split_documents(docs)
36
+
37
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
38
+ vectorstore = FAISS.from_documents(texts, embeddings)
39
+
40
+ # 5. RAG 체인 생성
41
  qa_chain = RetrievalQA.from_chain_type(
42
  llm=llm,
43
  chain_type="stuff",
44
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
45
  )
46
 
47
+ # 6. 챗봇 함수
48
  def chatbot(query):
49
  try:
50
  response = qa_chain.run(query)
 
52
  except Exception as e:
53
  return f"오류: {str(e)}."
54
 
55
+ # 7. Gradio UI
56
  with gr.Blocks(title="Ericsson 장비 분석 챗봇") as demo:
57
  gr.Markdown("# 🚀 3G/LTE/5G 장비 불량/불요파 분석 챗봇")
58
+ gr.Markdown("Hugging Face Dataset에서 로드한 PDF를 기반으로 질문만 입력하세요!")
59
  query = gr.Textbox(label="질문 (한국어/영어)", placeholder="Spurious Emission 원인은?")
60
  output = gr.Textbox(label="응답", lines=10)
61
  btn = gr.Button("분석 시작!")