tdecae committed on
Commit
687fccd
·
verified ·
1 Parent(s): 82134da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -18
app.py CHANGED
@@ -4,15 +4,15 @@ from langchain.chains import ConversationalRetrievalChain
4
  from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
5
  from langchain.text_splitter import CharacterTextSplitter
6
  from langchain.vectorstores import Chroma
7
- from sentence_transformers import SentenceTransformer
8
  from transformers import pipeline
9
  import gradio as gr
10
 
11
- # Workaround for sqlite in HuggingFace Spaces
12
  __import__('pysqlite3')
13
  sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
14
 
15
- # πŸ“„ Load documents
16
  docs = []
17
  for f in os.listdir("multiple_docs"):
18
  if f.endswith(".pdf"):
@@ -25,26 +25,30 @@ for f in os.listdir("multiple_docs"):
25
  loader = TextLoader(os.path.join("multiple_docs", f))
26
  docs.extend(loader.load())
27
 
28
- # πŸ”— Split into chunks
29
  splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
30
  docs = splitter.split_documents(docs)
31
 
32
- # 🧠 Compute embeddings
33
- embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
34
  texts = [doc.page_content for doc in docs]
35
  metadatas = [{"id": i} for i in range(len(texts))]
36
- embeddings = embedding_model.encode(texts)
37
 
38
- # πŸ—ƒοΈ Save in Chroma vectorstore
39
- vectorstore = Chroma(persist_directory="./db")
40
- vectorstore.add_texts(texts=texts, metadatas=metadatas, embeddings=embeddings)
 
 
 
 
 
 
41
  vectorstore.persist()
42
 
43
- # πŸ€– Load free LLM with pipeline
44
- model_name = "google/flan-t5-large" # small enough for CPU
45
  generator = pipeline("text2text-generation", model=model_name, device=-1) # -1 β†’ CPU
46
 
47
- # πŸ”— Wrap the pipeline for langchain
48
  class HuggingFaceLLMWrapper:
49
  def __init__(self, generator):
50
  self.generator = generator
@@ -55,7 +59,7 @@ class HuggingFaceLLMWrapper:
55
 
56
  llm = HuggingFaceLLMWrapper(generator)
57
 
58
- # πŸ”— Create the conversational chain
59
  chain = ConversationalRetrievalChain.from_llm(
60
  llm,
61
  retriever=vectorstore.as_retriever(search_kwargs={'k': 6}),
@@ -63,7 +67,7 @@ chain = ConversationalRetrievalChain.from_llm(
63
  verbose=False
64
  )
65
 
66
- # πŸ’¬ Gradio UI
67
  chat_history = []
68
 
69
  with gr.Blocks() as demo:
@@ -71,13 +75,11 @@ with gr.Blocks() as demo:
71
  [("", "Hello, I'm Thierry Decae's chatbot. Ask me about my experience, skills, eligibility, etc.")],
72
  avatar_images=["./multiple_docs/Guest.jpg", "./multiple_docs/Thierry Picture.jpg"]
73
  )
74
- msg = gr.Textbox()
75
  clear = gr.Button("Clear")
76
 
77
  def user(query, chat_history):
78
- # convert chat history to tuples
79
  chat_history_tuples = [(m[0], m[1]) for m in chat_history]
80
- # get answer
81
  result = chain({"question": query, "chat_history": chat_history_tuples})
82
  chat_history.append((query, result["answer"]))
83
  return gr.update(value=""), chat_history
 
4
  from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
5
  from langchain.text_splitter import CharacterTextSplitter
6
  from langchain.vectorstores import Chroma
7
+ from langchain.embeddings import HuggingFaceEmbeddings
8
  from transformers import pipeline
9
  import gradio as gr
10
 
11
+ # Workaround for sqlite in HuggingFace Spaces & environments without sqlite3
12
  __import__('pysqlite3')
13
  sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
14
 
15
+ # πŸ“„ Load documents from multiple_docs folder
16
  docs = []
17
  for f in os.listdir("multiple_docs"):
18
  if f.endswith(".pdf"):
 
25
  loader = TextLoader(os.path.join("multiple_docs", f))
26
  docs.extend(loader.load())
27
 
28
+ # πŸ”— Split into smaller chunks
29
  splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
30
  docs = splitter.split_documents(docs)
31
 
32
+ # 🧠 Prepare texts and metadata
 
33
  texts = [doc.page_content for doc in docs]
34
  metadatas = [{"id": i} for i in range(len(texts))]
 
35
 
36
+ # 🧬 Embeddings
37
+ embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
38
+
39
+ # πŸ—ƒοΈ Vectorstore
40
+ vectorstore = Chroma(
41
+ persist_directory="./db",
42
+ embedding_function=embedding_function
43
+ )
44
+ vectorstore.add_texts(texts=texts, metadatas=metadatas)
45
  vectorstore.persist()
46
 
47
+ # πŸ€– Load free LLM using pipeline
48
+ model_name = "google/flan-t5-large" # or flan-t5-base if you prefer faster
49
  generator = pipeline("text2text-generation", model=model_name, device=-1) # -1 β†’ CPU
50
 
51
+ # πŸ”— Wrap pipeline in a callable for LangChain
52
  class HuggingFaceLLMWrapper:
53
  def __init__(self, generator):
54
  self.generator = generator
 
59
 
60
  llm = HuggingFaceLLMWrapper(generator)
61
 
62
+ # πŸ”— Create Conversational QA chain
63
  chain = ConversationalRetrievalChain.from_llm(
64
  llm,
65
  retriever=vectorstore.as_retriever(search_kwargs={'k': 6}),
 
67
  verbose=False
68
  )
69
 
70
+ # πŸ’¬ Gradio interface
71
  chat_history = []
72
 
73
  with gr.Blocks() as demo:
 
75
  [("", "Hello, I'm Thierry Decae's chatbot. Ask me about my experience, skills, eligibility, etc.")],
76
  avatar_images=["./multiple_docs/Guest.jpg", "./multiple_docs/Thierry Picture.jpg"]
77
  )
78
+ msg = gr.Textbox(placeholder="Type your question here...")
79
  clear = gr.Button("Clear")
80
 
81
  def user(query, chat_history):
 
82
  chat_history_tuples = [(m[0], m[1]) for m in chat_history]
 
83
  result = chain({"question": query, "chat_history": chat_history_tuples})
84
  chat_history.append((query, result["answer"]))
85
  return gr.update(value=""), chat_history