Anvit25 committed on
Commit
24510a0
·
1 Parent(s): f92b487

Initial commit of Gradio chatbot

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: LLM Chatbot2
3
- emoji: 🔥
4
- colorFrom: gray
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 5.47.2
8
  app_file: app.py
 
1
  ---
2
+ title: LLM
3
+ emoji: 📈
4
+ colorFrom: blue
5
+ colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.47.2
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from langchain_community.document_loaders import TextLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
6
+ from langchain_community.vectorstores import Chroma
7
+ from langchain_community.llms import HuggingFacePipeline
8
+ from langchain.chains import ConversationalRetrievalChain
9
+ from langchain.memory import ConversationBufferMemory
10
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
11
+
12
+ # -------------------------------------------------------------------
13
+ # Constants: vectorstore location, model identifiers, and the source document
14
+ DB_DIR = "chroma_db"  # passed as persist_directory when building/loading the Chroma store
15
+ MODEL_NAME_EMBEDDINGS = "sentence-transformers/all-MiniLM-L6-v2"  # model_name given to HuggingFaceEmbeddings
16
+ MODEL_ID_LLM = "google/flan-t5-base"  # loaded via AutoTokenizer / AutoModelForSeq2SeqLM.from_pretrained
17
+ DOC_PATH = "temp_docs/samsung_manual.txt" # fixed document path read by TextLoader
18
+
19
+ # Globals: module-level state shared by init_chatbot() and chatbot_response()
20
+ conversation_chain = None  # set by init_chatbot(); None means the chatbot is not ready
21
+ chat_history = [] # list of message dicts: [{"role": "user/assistant", "content": "..."}]
22
+
23
+ # -------------------------------------------------------------------
24
def load_and_process_document():
    """Build a persisted Chroma vectorstore from the manual at ``DOC_PATH``.

    The document is read as UTF-8, split with
    ``RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)``,
    embedded with the ``MODEL_NAME_EMBEDDINGS`` model, and stored under
    ``DB_DIR``.

    Returns:
        A ``(vectorstore, chunk_count)`` tuple, where ``chunk_count`` is the
        number of chunks produced by the splitter.

    Raises:
        FileNotFoundError: If ``DOC_PATH`` does not exist.
    """
    manual_missing = not os.path.exists(DOC_PATH)
    if manual_missing:
        raise FileNotFoundError(f"❌ Document not found at: {DOC_PATH}")

    print("📄 Loading document...")
    # Explicit UTF-8 so special characters in the manual decode correctly.
    raw_documents = TextLoader(DOC_PATH, encoding="utf-8").load()

    print("✂️ Splitting document into chunks...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    chunks = splitter.split_documents(raw_documents)
    chunk_count = len(chunks)

    print("🧠 Creating embeddings...")
    embedder = HuggingFaceEmbeddings(model_name=MODEL_NAME_EMBEDDINGS)

    print("💾 Building Chroma vectorstore...")
    store = Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory=DB_DIR,
    )
    return store, chunk_count
48
+
49
+
50
def get_conversational_chain(vectorstore):
    """Create the conversational retrieval chain backed by ``vectorstore``.

    Loads the ``MODEL_ID_LLM`` seq2seq model and tokenizer, wraps them in a
    ``text2text-generation`` pipeline, and combines that LLM with a retriever
    over ``vectorstore`` and a conversation buffer memory.

    Args:
        vectorstore: Vector store exposing ``as_retriever``; the retriever is
            configured with ``k=2`` results per query.

    Returns:
        A ``ConversationalRetrievalChain`` that keeps its own chat history in
        memory under the ``"chat_history"`` key.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID_LLM)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID_LLM)
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=512,
        # temperature/top_p only take effect in sampling mode; without
        # do_sample=True they are ignored and generation is greedy.
        do_sample=True,
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.2,
    )
    llm = HuggingFacePipeline(pipeline=pipe)

    # The memory object carries prior turns, so callers pass only "question".
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )

    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 2}),
        memory=memory,
    )
    return chain
76
+
77
def chatbot_response(user_input):
    """Answer ``user_input`` with the conversation chain and record the turn.

    Appends message dicts of the form ``{"role": ..., "content": ...}`` to
    the module-level ``chat_history`` list and returns that list.

    Args:
        user_input: The question typed by the user.

    Returns:
        The module-level ``chat_history`` list after this turn. If the chain
        is not initialized, a single assistant warning is appended. Blank
        input leaves the history unchanged. If the chain raises, the error
        is reported as the assistant's reply instead of propagating.
    """
    global conversation_chain, chat_history

    if conversation_chain is None:
        chat_history.append({
            "role": "assistant",
            "content": "⚠️ The chatbot is not ready. Please check the server logs."
        })
        return chat_history

    # Nothing to ask: do not send an empty question to the retriever/LLM.
    if not user_input or not user_input.strip():
        return chat_history

    chat_history.append({"role": "user", "content": user_input})
    try:
        # `.invoke` replaces the deprecated `chain(inputs)` call style.
        response = conversation_chain.invoke({"question": user_input})
        ai_answer = response["answer"]
    except Exception as e:
        # Same policy as init_chatbot: log the traceback and show the failure
        # in the UI, so the user turn is never left without a reply.
        import traceback
        traceback.print_exc()
        ai_answer = f"❌ Failed to generate a response: {e}"

    chat_history.append({"role": "assistant", "content": ai_answer})
    return chat_history
93
+
94
+ # -------------------------------------------------------------------
95
+ # Gradio Interface
96
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 Chat with Samsung Manual")
    gr.Markdown("Ask questions about the **Samsung Manual** document below:")

    # Status info; the placeholder is overwritten below once init has run.
    status_md = gr.Markdown("⏳ Initializing chatbot...")

    # Chat interface
    chatbot = gr.Chatbot(label="Conversation", type="messages")
    user_input = gr.Textbox(
        label="Type your question here…",
        placeholder="Ask me about the Samsung manual..."
    )
    submit_btn = gr.Button("Ask")

    # ---------------------------------------------------------------
    # Initialization function to show status
    def init_chatbot():
        """Prepare the vectorstore and conversation chain.

        Rebuilds the Chroma store from the document when ``DB_DIR`` is
        missing or empty; otherwise loads the persisted store. On success
        the module-level ``conversation_chain`` is set.

        Returns:
            A status message for display: a success line with the chunk
            count, or a failure line containing the exception text.
        """
        global conversation_chain
        try:
            if not os.path.exists(DB_DIR) or not os.listdir(DB_DIR):
                # Rebuild vectorstore
                vectorstore, chunks = load_and_process_document()
                msg = f"✅ Manual processed and stored! Total chunks: {chunks}"
            else:
                embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME_EMBEDDINGS)
                vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
                # count() asks the collection for its size instead of
                # fetching every stored record just to measure the list.
                chunks = vectorstore._collection.count()
                msg = f"✅ Chroma DB loaded! Total chunks: {chunks}"

            conversation_chain = get_conversational_chain(vectorstore)
            return msg
        except Exception as e:
            # Startup boundary: log the full traceback, surface a short
            # message in the UI, and leave conversation_chain unset.
            import traceback
            traceback.print_exc()
            return f"❌ Failed to initialize chatbot: {e}"

    # Initialize on startup (runs while the module executes, before launch).
    status_md.value = init_chatbot()

    submit_btn.click(
        fn=chatbot_response,
        inputs=user_input,
        outputs=chatbot
    )

# -------------------------------------------------------------------
if __name__ == "__main__":
    demo.launch()
chroma_db/54b1f93e-ee6f-4cfc-851a-34051bcd606f/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c15b835015ce916740eb6cd0034bcb75aa168e15c1530dc85ef2037dbe86ea63
3
+ size 167600
chroma_db/54b1f93e-ee6f-4cfc-851a-34051bcd606f/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
3
+ size 100
chroma_db/54b1f93e-ee6f-4cfc-851a-34051bcd606f/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e6c8a0a797078bde9f13737953401d49c57173cc93c3ad2809dfb504655055b
3
+ size 400
chroma_db/54b1f93e-ee6f-4cfc-851a-34051bcd606f/link_lists.bin ADDED
File without changes
chroma_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b80a781782db463beb86ef421e2f11119b72f36cb4270a915f1ca203faa0552
3
+ size 1990656
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pypdf
2
+ gradio
3
+ langchain
4
+ chromadb
5
+ sentence-transformers
6
+ transformers
7
+ torch
temp_docs/samsung_manual.txt ADDED
The diff for this file is too large to render. See raw diff