faisalsns committed on
Commit 70344e5 · 1 Parent(s): 54cbbae

Initial commit for app

Files changed (4)
  1. Dockerfile +12 -0
  2. docker-compose.yml +46 -0
  3. main.py +249 -0
  4. requirements.txt +24 -0
Dockerfile ADDED
@@ -0,0 +1,12 @@
+ # Dockerfile
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ COPY requirements.txt ./
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY . .
+
+ CMD ["python", "main.py"]
+
docker-compose.yml ADDED
@@ -0,0 +1,46 @@
+ version: '3.8'
+ services:
+   zookeeper:
+     image: bitnami/zookeeper:latest
+     container_name: zookeeper
+     ports:
+       - "2181:2181"
+     environment:
+       - ALLOW_ANONYMOUS_LOGIN=yes
+     volumes:
+       - zookeeper_data:/bitnami
+
+   kafka:
+     image: bitnami/kafka:3.4
+     container_name: kafka
+     ports:
+       - "9094:9094" # external
+       - "9092:9092" # internal
+     environment:
+       - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181
+       - KAFKA_CFG_LISTENERS=INTERNAL://0.0.0.0:9092,EXTERNAL://0.0.0.0:9094
+       - KAFKA_CFG_ADVERTISED_LISTENERS=INTERNAL://kafka:9092,EXTERNAL://localhost:9094
+       - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT
+       - KAFKA_INTER_BROKER_LISTENER_NAME=INTERNAL
+       - ALLOW_PLAINTEXT_LISTENER=yes
+       - KAFKA_CFG_BROKER_ID=1
+     depends_on:
+       - zookeeper
+     volumes:
+       - kafka_data:/bitnami
+
+   app:
+     build: .
+     ports:
+       - "7860:7860"
+       - "5678:5678"
+     depends_on:
+       - kafka
+     environment:
+       - PYTHONUNBUFFERED=1
+     volumes:
+       - .:/app
+
+ volumes:
+   zookeeper_data:
+   kafka_data:
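
Note on the listener split above: containers on the compose network (such as the app service) reach the broker through the INTERNAL address kafka:9092, while anything running on the host uses the EXTERNAL address localhost:9094. A minimal host-side smoke test with kafka-python might look like the sketch below; it assumes the stack above is running, reuses the wiki-topic name from main.py, and is illustrative only (not part of this commit):

# host_smoke_test.py (hypothetical helper, not part of this commit)
# Publish one test message through the EXTERNAL listener mapped to the host.
import json
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers="localhost:9094",  # EXTERNAL listener; containers use kafka:9092 instead
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)
producer.send("wiki-topic", {"title": "Smoke test", "content": "hello from the host"})
producer.flush()  # block until the message is actually delivered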
main.py ADDED
@@ -0,0 +1,249 @@
+ # main.py
+ import gradio as gr
+ import wikipedia
+ import json
+ import uuid
+ from kafka import KafkaProducer, KafkaConsumer
+ from sentence_transformers import SentenceTransformer
+ import faiss
+ import numpy as np
+ from langchain.vectorstores import FAISS
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.chains import RetrievalQA
+ from langchain.llms import HuggingFaceHub
+
+
+ from wikipedia.exceptions import DisambiguationError, PageError
+
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+ # Globals
+ TOPICS = ["Art", "Science", "Technology", "Movies", "Sports", "Politics"]
+ producer = KafkaProducer(bootstrap_servers='kafka:9092', value_serializer=lambda v: json.dumps(v).encode('utf-8'))
+ consumer = KafkaConsumer('wiki-topic', bootstrap_servers='kafka:9092', value_deserializer=lambda m: json.loads(m.decode('utf-8')))
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+ index = faiss.IndexFlatL2(384)  # 384 = embedding dimension of all-MiniLM-L6-v2
+ faiss_store = []
+ metadatas = []
+
+ # Agent 1 - Scrape Wikipedia and produce to Kafka
+ import logging
+
+ logging.basicConfig(level=logging.INFO)
+
+
+ def agent1_scrape_and_publish(selected_topics, count):
+     count = min(count, 20)
+     results = []
+
+     # Topic mappings to handle disambiguation (keys match the TOPICS choices above)
+     topic_mappings = {
+         'Art': ['Art', 'The arts', 'Visual arts'],
+         'Science': ['Science', 'Natural science', 'Scientific method'],
+         'Sports': ['Sport', 'Sports', 'Athletic sports'],
+         'Movies': ['Film', 'Cinema', 'Movie'],
+         'Technology': ['Technology', 'Information technology', 'Modern technology']
+     }
+
+     def get_wikipedia_page(topic):
+         """Get a Wikipedia page for a topic, handling disambiguation and missing pages."""
+
+         # Try the exact topic first
+         try:
+             return wikipedia.page(topic)
+         except DisambiguationError as e:
+             logging.info(f"Disambiguation found for '{topic}'. Options: {e.options[:3]}")
+
+             # Try mapped alternatives first
+             if topic in topic_mappings:
+                 for alternative in topic_mappings[topic]:
+                     try:
+                         page = wikipedia.page(alternative)
+                         logging.info(f"Successfully resolved '{topic}' to '{page.title}'")
+                         return page
+                     except (DisambiguationError, PageError):
+                         continue
+
+             # Try the first few disambiguation options
+             for option in e.options[:3]:
+                 try:
+                     page = wikipedia.page(option)
+                     logging.info(f"Used disambiguation option '{option}' for '{topic}'")
+                     return page
+                 except (DisambiguationError, PageError):
+                     continue
+
+             raise Exception(f"Could not resolve disambiguation for '{topic}'")
+
+         except PageError:
+             logging.warning(f"Page not found for '{topic}', trying search...")
+
+             # Try searching for alternatives
+             search_results = wikipedia.search(topic, results=3)
+             if search_results:
+                 for result in search_results:
+                     try:
+                         page = wikipedia.page(result)
+                         logging.info(f"Found alternative '{result}' for '{topic}'")
+                         return page
+                     except Exception:
+                         continue
+
+             raise Exception(f"No suitable page found for '{topic}'")
+
+     for topic in selected_topics[:count]:
+         try:
+             logging.info(f"Processing topic: '{topic}'")
+             page = get_wikipedia_page(topic)
+
+             content = {
+                 'id': str(uuid.uuid4()),
+                 'title': page.title,
+                 'content': page.content[:3000],
+                 'original_topic': topic,  # Keep track of the original request
+                 'url': page.url,
+                 'summary': page.summary[:500]  # Add summary for better context
+             }
+
+             # Send to Kafka
+             future = producer.send('wiki-topic', content)
+             record_metadata = future.get(timeout=10)
+
+             logging.info(f"Successfully published: '{topic}' -> '{page.title}' to {record_metadata.topic}:{record_metadata.partition}")
+             # results.append(f"{topic} -> {page.title}")
+             results.append(f"{topic} -> {page.summary[:50]}...")  # Add summary snippet
+
+         except Exception as e:
+             error_msg = f"Failed to process topic '{topic}': {e}"
+             logging.error(error_msg)
+             results.append(f"{topic} -> ERROR: {str(e)}")
+             # Continue with other topics instead of stopping
+
+     producer.flush()
+     return "\n".join(results)
+
+ # Agent 2 - Consume, format, vectorize, store
+
+ def agent2_consume_and_index():
+     for message in consumer:
+         data = message.value
+         title = data['title'][:100]
+         content = data['content']
+         intro = content[:200]
+         desc = content[200:500]
+         combined = f"{title}. {intro}. {desc}"
+         emb = embedding_model.encode([combined])
+         index.add(np.array(emb))
+         faiss_store.append(combined)
+         metadatas.append({"title": title})
+         if len(faiss_store) >= 20:
+             break  # Stop after 20 articles; the consumer thread exits here
+
+
+ from transformers import pipeline
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.vectorstores import FAISS
+
+ def ask_question(question, min_confidence=0.3):
+     if not faiss_store:
+         return "Index empty. Run Agent 1 and 2 first."
+
+     try:
+         # Initialize embeddings
+         embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+         # Create a FAISS database from the stored texts
+         db = FAISS.from_texts(faiss_store, embeddings, metadatas=metadatas)
+
+         # Perform similarity search - increase k for more context
+         relevant_docs = db.similarity_search_with_score(question, k=5)
+
+         # Filter by similarity score (the score is a distance, so lower means more similar)
+         filtered_docs = [doc for doc, score in relevant_docs if score < 0.8]  # Adjust threshold as needed
+
+         if not filtered_docs:
+             return "No relevant information found in the knowledge base."
+
+         # Create a comprehensive context
+         context = "\n\n".join([doc.page_content for doc in filtered_docs])
+
+         # Use a generative model to compose the answer
+         from transformers import pipeline, AutoTokenizer
+
+         # flan-t5-base handles instruction-style Q&A prompts
+         # (note: the pipeline is re-created on every question; caching it would speed up repeated queries)
+         generator = pipeline(
+             "text2text-generation",
+             model="google/flan-t5-base",
+             tokenizer="google/flan-t5-base"
+         )
+
+         # Build the prompt (context truncated to stay within token limits)
+         prompt = f"""Answer the following question based on the provided context. If the context doesn't contain enough information, say so.
+
+ Context: {context[:2000]}
+
+ Question: {question}
+
+ Answer:"""
+
+         response = generator(
+             prompt,
+             max_length=200,
+             min_length=20,
+             temperature=0.7,
+             do_sample=True
+         )
+
+         return response[0]['generated_text'].strip()
+
+     except Exception as e:
+         return f"Error processing question: {str(e)}"
+
+ hf_links = [
+     ("AI Reasoning Copilot", "https://huggingface.co/spaces/faisalsns/ai-reasoning-copilot"),
+     ("Language Detection Compare Models", "https://huggingface.co/spaces/faisalsns/language-detection-compare-models/"),
+     ("Prompt Playground v1 - Compare Models Output", "https://huggingface.co/spaces/faisalsns/prompt-canvas-engine"),
+     ("Mental Disorders Symptoms", "https://huggingface.co/spaces/faisalsns/mental-disorders-symptoms")
+ ]
+
+ def get_links():
+     otherlinks = "<br>".join([f"[{name}]({url})" for name, url in hf_links])
+     return f"### Other Applications To Explore!\n{otherlinks}"
+
+ # Gradio UI
+ with gr.Blocks() as demo:
+     gr.Markdown("## AI Copilot for Wikipedia")
+     with gr.Row():
+         with gr.Column(scale=1):
+             topic_list = gr.CheckboxGroup(choices=TOPICS, label="Select Topics")
+             topic_count = gr.Slider(1, 20, step=1, label="Number of Topics to Scrape")
+             scrape_btn = gr.Button("Generate from Wikipedia")
+             output_titles = gr.Textbox(label="Article Titles", lines=6)
+
+         with gr.Column(scale=2):
+             question_box = gr.Textbox(label="Ask a question")
+             submit_btn = gr.Button("Submit")
+             answer_box = gr.Textbox(label="Answer")
+             gr.Markdown(
+                 """
+ **A Note -**
+ I am on a self-directed AI journey, and for this project I am building an AI Copilot that scrapes Wikipedia content for user-selected categories of articles, indexes it, and then answers questions based on that content.
+ I have always wanted to use Kafka for messaging, so I am using it here. I am also using FAISS for vector storage, since I have already tried ChromaDB, and HuggingFace models for embeddings and question answering.
+
+ There's no better way to learn than to build it yourself 🚀
+ """
+             )
+             gr.Markdown(get_links())
+
+     scrape_btn.click(fn=agent1_scrape_and_publish, inputs=[topic_list, topic_count], outputs=output_titles)
+     submit_btn.click(fn=ask_question, inputs=question_box, outputs=answer_box)
+
+ if __name__ == "__main__":
+     import threading
+     threading.Thread(target=agent2_consume_and_index, daemon=True).start()
+     demo.launch(server_name="0.0.0.0", server_port=7860)
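
As an aside, the retrieval step inside ask_question() can be exercised on its own. Below is a minimal sketch under the same pinned langchain 0.1.x stack; the sample texts are illustrative, and similarity_search_with_score returns a distance, so smaller scores mean closer matches (ask_question keeps documents with score < 0.8):

# retrieval_check.py (hypothetical, illustrative only; not part of this commit)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

texts = [
    "Art. Art is a diverse range of human activity involving creative expression.",
    "Science. Science is a systematic enterprise that builds and organizes knowledge.",
]
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_texts(texts, embeddings, metadatas=[{"title": "Art"}, {"title": "Science"}])

for doc, score in db.similarity_search_with_score("What is art?", k=2):
    # Lower score = closer match, mirroring the score < 0.8 filter in ask_question()
    print(f"{score:.3f}  {doc.metadata['title']}")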
requirements.txt ADDED
@@ -0,0 +1,24 @@
+ # requirements.txt
+ # gradio
+ # wikipedia
+ # kafka-python
+ # sentence-transformers
+ # faiss-cpu
+ # langchain
+ # huggingface-hub
+ # numpy
+
+
+ gradio<4.0.0
+ wikipedia
+ kafka-python
+ sentence-transformers==2.2.2
+ faiss-cpu
+ langchain==0.1.14
+ huggingface-hub==0.19.3
+ numpy
+ pydantic<2.0.0
+ debugpy
+ python-dotenv
+ # huggingface_hub  (duplicate of the pinned huggingface-hub above)
+ langchain-community>=0.0.30,<0.1