dgmos committed on
Commit
9bb5aaa
·
1 Parent(s): 91903f0

Update app.py and requirements.txt with OCR support

Browse files
Files changed (1) hide show
  1. app.py +8 -14
app.py CHANGED
@@ -1,30 +1,27 @@
1
 
2
  import os
3
- from langchain_community.document_loaders import UnstructuredPDFLoader, CSVLoader # deprecated 해결
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain_community.embeddings import HuggingFaceEmbeddings # deprecated 해결
6
- from langchain_community.vectorstores import FAISS # deprecated 해결
7
- from langchain_huggingface import HuggingFaceEndpoint # deprecated 해결 (HuggingFaceHub 대체)
8
  from langchain.chains import RetrievalQA
9
  import gradio as gr
10
  from PIL import Image
11
  import pytesseract
12
 
13
- # 1. 환경 변수 설정 (HF Space Secrets에서 토큰 로드)
14
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
15
  raise ValueError("HUGGINGFACEHUB_API_TOKEN 환경 변수가 설정되지 않았습니다. HF Space Settings > Secrets에서 추가하세요.")
16
  os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")
17
 
18
- # 2. 모델 설정 (Llama 3.2-3B-Instruct, 텍스트 태스크)
19
- repo_id = "meta-llama/Llama-3.2-3B-Instruct" # Vision 태스크 에러 해결, 무료 티어 호환
20
  llm = HuggingFaceEndpoint(
21
  repo_id=repo_id,
22
  huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
23
  temperature=0.7,
24
- task="text-generation" # 태스크 명시
25
  )
26
 
27
- # 3. 문서 로드 함수 (PDF/CSV/이미지)
28
  def load_documents(files):
29
  docs = []
30
  for file_path in files:
@@ -36,7 +33,7 @@ def load_documents(files):
36
  docs.extend(loader.load())
37
  elif file_path.endswith((".jpg", ".png")):
38
  img = Image.open(file_path)
39
- text = pytesseract.image_to_string(img, lang="kor+eng") # 한국어+영어 OCR
40
  docs.append(text)
41
  else:
42
  continue
@@ -46,7 +43,6 @@ def load_documents(files):
46
  vectorstore = FAISS.from_documents(texts, embeddings)
47
  return vectorstore
48
 
49
- # 4. RAG 체인 생성
50
  def create_rag_chain(vectorstore):
51
  qa_chain = RetrievalQA.from_chain_type(
52
  llm=llm,
@@ -55,7 +51,6 @@ def create_rag_chain(vectorstore):
55
  )
56
  return qa_chain
57
 
58
- # 5. 챗봇 함수
59
  def chatbot(query, files):
60
  if not files:
61
  return "파일 업로드 필요 (PDF/CSV/이미지/PPTX)."
@@ -67,7 +62,6 @@ def chatbot(query, files):
67
  except Exception as e:
68
  return f"오류: {str(e)}."
69
 
70
- # 6. Gradio UI
71
  with gr.Blocks(title="Ericsson 장비 분석 챗봇") as demo:
72
  gr.Markdown("# 🚀 3G/LTE/5G 장비 불량/불요파 분석 챗봇")
73
  gr.Markdown("PDF/CSV/이미지 업로드 후 질문: e.g., 'Spurious Emission 통계?'")
@@ -78,4 +72,4 @@ with gr.Blocks(title="Ericsson 장비 분석 챗봇") as demo:
78
  btn.click(chatbot, inputs=[query, files], outputs=output)
79
 
80
  if __name__ == "__main__":
81
- demo.launch(server_name="0.0.0.0", server_port=7860) # HF Space 호환
 
1
 
2
  import os
3
+ from langchain_community.document_loaders import UnstructuredPDFLoader, CSVLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
6
+ from langchain_community.vectorstores import FAISS
7
+ from langchain_huggingface import HuggingFaceEndpoint
8
  from langchain.chains import RetrievalQA
9
  import gradio as gr
10
  from PIL import Image
11
  import pytesseract
12
 
 
13
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
14
  raise ValueError("HUGGINGFACEHUB_API_TOKEN 환경 변수가 설정되지 않았습니다. HF Space Settings > Secrets에서 추가하세요.")
15
  os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")
16
 
17
+ repo_id = "meta-llama/Llama-3.2-3B-Instruct"
 
18
  llm = HuggingFaceEndpoint(
19
  repo_id=repo_id,
20
  huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
21
  temperature=0.7,
22
+ task="text-generation"
23
  )
24
 
 
25
  def load_documents(files):
26
  docs = []
27
  for file_path in files:
 
33
  docs.extend(loader.load())
34
  elif file_path.endswith((".jpg", ".png")):
35
  img = Image.open(file_path)
36
+ text = pytesseract.image_to_string(img, lang="kor+eng")
37
  docs.append(text)
38
  else:
39
  continue
 
43
  vectorstore = FAISS.from_documents(texts, embeddings)
44
  return vectorstore
45
 
 
46
  def create_rag_chain(vectorstore):
47
  qa_chain = RetrievalQA.from_chain_type(
48
  llm=llm,
 
51
  )
52
  return qa_chain
53
 
 
54
  def chatbot(query, files):
55
  if not files:
56
  return "파일 업로드 필요 (PDF/CSV/이미지/PPTX)."
 
62
  except Exception as e:
63
  return f"오류: {str(e)}."
64
 
 
65
  with gr.Blocks(title="Ericsson 장비 분석 챗봇") as demo:
66
  gr.Markdown("# 🚀 3G/LTE/5G 장비 불량/불요파 분석 챗봇")
67
  gr.Markdown("PDF/CSV/이미지 업로드 후 질문: e.g., 'Spurious Emission 통계?'")
 
72
  btn.click(chatbot, inputs=[query, files], outputs=output)
73
 
74
  if __name__ == "__main__":
75
+ demo.launch(server_name="0.0.0.0", server_port=7860)