Michtiii committed on
Commit 6fd54c1 · verified · 1 Parent(s): 0485438

Update app.py

Files changed (1)
  1. app.py +58 -184
app.py CHANGED
@@ -1,194 +1,68 @@
 
 import os
- import faiss
- import numpy as np
- import gradio as gr
-
- from sentence_transformers import SentenceTransformer
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from PyPDF2 import PdfReader

- # -----------------------------
- # CONFIG
- # -----------------------------
- DATA_PATH = "Docs"
- TOP_K = 3
- LLM_MODEL = "google/flan-t5-base"
-
- # -----------------------------
- # LOAD MODELS
- # -----------------------------
- embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
-
- tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
- llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL)
-
- # -----------------------------
- # FILE LOADER
- # -----------------------------
- def read_file(path):
-     if path.endswith(".txt") or path.endswith(".md"):
-         with open(path, "r", encoding="utf-8") as f:
-             return f.read()
-     elif path.endswith(".pdf"):
-         reader = PdfReader(path)
-         text = ""
-         for page in reader.pages:
-             text += page.extract_text() or ""
-         return text
-     return ""

- def load_docs(folder):
     texts = []
-     if not os.path.exists(folder):
-         return texts
-
     for file in os.listdir(folder):
-         path = os.path.join(folder, file)
-         try:
-             txt = read_file(path)
-             if txt.strip():
-                 texts.append(txt)
-         except:
-             continue
     return texts

- # -----------------------------
- # CHUNKING
- # -----------------------------
- def chunk_text(text, size=300, overlap=50):
-     words = text.split()
-     return [" ".join(words[i:i + size]) for i in range(0, len(words), size - overlap)]
-
- # -----------------------------
- # BUILD VECTOR DB
- # -----------------------------
- def build_index(docs):
-     chunks = []
-     for doc in docs:
-         chunks.extend(chunk_text(doc))
-
-     if not chunks:
-         return None, []
-
-     embeddings = embedding_model.encode(chunks)
-     dim = embeddings.shape[1]
-
-     index = faiss.IndexFlatL2(dim)
-     index.add(np.array(embeddings))
-
-     return index, chunks
-
- # -----------------------------
- # RETRIEVE
- # -----------------------------
- def retrieve(query, index, chunks, k=TOP_K):
-     q_embed = embedding_model.encode([query])
-     D, I = index.search(np.array(q_embed), k)
-     return [chunks[i] for i in I[0]]
-
- # -----------------------------
- # GENERATE ANSWER (WITH MEMORY)
- # -----------------------------
- def generate_answer(query, contexts, history):
-     context = "\n\n".join(contexts)
-
-     history_text = ""
-     for h in history[-6:]:
-         history_text += f"{h['role']}: {h['content']}\n"
-
-     prompt = f"""
- You are an expert AI/ML assistant.
-
- Conversation:
- {history_text}
-
- Context:
- {context}
-
- Question:
- {query}
-
- Answer clearly:
- """

-     inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
-     outputs = llm_model.generate(**inputs, max_new_tokens=200)
-
-     return tokenizer.decode(outputs[0], skip_special_tokens=True)
-
- # -----------------------------
- # TOOL RECOMMENDER (AGENT MODE)
- # -----------------------------
- def tool_recommender(query):
-     prompt = f"""
- You are an AI architect.
-
- Suggest best AI/ML tools for:
-
- {query}
-
- Give:
- - Tools
- - Why
- - Use cases
- """
-
-     inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
-     outputs = llm_model.generate(**inputs, max_new_tokens=150)
-
-     return tokenizer.decode(outputs[0], skip_special_tokens=True)
-
- # -----------------------------
- # INIT
- # -----------------------------
- docs = load_docs(DATA_PATH)
- index, chunks = build_index(docs)
-
- # -----------------------------
- # MAIN CHAT PIPELINE
- # -----------------------------
- def rag_chat(query, history):
-     history = history or []
-
-     if index is None:
-         history.append({"role": "user", "content": query})
-         history.append({"role": "assistant", "content": "No documents found"})
-         return history
-
-     retrieved = retrieve(query, index, chunks)
-
-     # Agent decision
-     if "recommend" in query.lower() or "best tool" in query.lower():
-         answer = tool_recommender(query)
-     else:
-         answer = generate_answer(query, retrieved, history)
-
-     history.append({"role": "user", "content": query})
-     history.append({"role": "assistant", "content": answer})
-
-     return history
-
- # -----------------------------
- # UI HANDLER
- # -----------------------------
- def respond(message, chat_history):
-     chat_history = chat_history or []
-     updated_history = rag_chat(message, chat_history)
-     return "", updated_history
-
- # -----------------------------
- # UI (CHATGPT STYLE)
- # -----------------------------
 with gr.Blocks() as demo:
-     gr.Markdown("## AI/ML Conversational RAG + Tool Recommender")
-
-     chatbot = gr.Chatbot(type="messages")
-     msg = gr.Textbox(placeholder="Ask about AI tools, ML, companies...")
-
-     msg.submit(respond, [msg, chatbot], [msg, chatbot])
-
- # -----------------------------
- # RUN
- # -----------------------------
- if __name__ == "__main__":
-     demo.launch()
 
+ # app.py
 import os
 from PyPDF2 import PdfReader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.chains import RetrievalQA
+ from langchain.llms import HuggingFaceHub
+ import gradio as gr

+ # --------- Configuration ---------
+ DOCS_FOLDER = "Docs"  # Folder containing PDF files
+ HF_API_KEY = os.getenv("HF_TOKEN")  # HuggingFace API token

+ # --------- Load and process PDFs ---------
+ def load_pdfs(folder):
     texts = []
     for file in os.listdir(folder):
+         if file.endswith(".pdf"):
+             pdf_path = os.path.join(folder, file)
+             reader = PdfReader(pdf_path)
+             for page in reader.pages:
+                 texts.append(page.extract_text())
     return texts

+ raw_texts = load_pdfs(DOCS_FOLDER)
+
+ # Split into smaller chunks for embeddings
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size=1000,
+     chunk_overlap=50
+ )
+ docs = text_splitter.split_text(" ".join(raw_texts))
+
+ # --------- Create embeddings and vectorstore ---------
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+ vectorstore = FAISS.from_texts(docs, embedding=embeddings)
+
+ # --------- Setup LLM & Retrieval QA chain ---------
+ llm = HuggingFaceHub(
+     repo_id="google/flan-t5-large",
+     model_kwargs={"temperature": 0, "max_length": 512},
+     huggingfacehub_api_token=HF_API_KEY
+ )
+ qa_chain = RetrievalQA.from_chain_type(
+     llm=llm,
+     chain_type="stuff",
+     retriever=vectorstore.as_retriever()
+ )
+
+ # --------- Gradio interface ---------
+ def answer_query(query):
+     return qa_chain.run(query)

 with gr.Blocks() as demo:
+     gr.Markdown("## PDF Document RAG QA System")
+     chatbot = gr.Chatbot()
+     msg = gr.Textbox(label="Enter your question:")
+     submit = gr.Button("Ask")
+
+     def chat_fn(user_input, chat_history):
+         answer = answer_query(user_input)
+         chat_history.append((user_input, answer))
+         return chat_history, ""
+
+     submit.click(chat_fn, inputs=[msg, chatbot], outputs=[chatbot, msg])
+
+ demo.launch(server_name="0.0.0.0", server_port=7860)
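
A note on the new loader: the removed read_file guarded page extraction with page.extract_text() or "", while the new load_pdfs appends the raw return value. Image-only pages, where extract_text() yields an empty string (or None on older PyPDF2 releases), can therefore leave empty or None entries in raw_texts and make " ".join(raw_texts) fail. A minimal sketch of how that guard could be carried over, assuming the same PyPDF2-based loader (not part of this commit):

import os
from PyPDF2 import PdfReader

def load_pdfs(folder):
    # Collect per-page text, skipping pages with no extractable text layer
    texts = []
    for file in os.listdir(folder):
        if file.endswith(".pdf"):
            reader = PdfReader(os.path.join(folder, file))
            for page in reader.pages:
                page_text = page.extract_text() or ""  # guard against None / empty pages
                if page_text.strip():
                    texts.append(page_text)
    return texts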