Rahaf2001 committed on
Commit
6d0080e
·
verified ·
1 Parent(s): 29879ca

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +215 -0
app.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ import faiss
7
+ from typing import List, Tuple
8
+ import re
9
+
10
# Retrieval state shared by the Gradio callbacks. It is (re)populated by
# process_documentation() and read by answer_question().
source_url: str = ""            # URL of the page that was last indexed
index = None                    # FAISS index built over doc_embeddings
doc_embeddings = None           # embedding matrix, one row per chunk
doc_chunks: List[str] = []      # text chunks, row-aligned with the index

# Sentence-embedding model, loaded once at import time and reused for both
# indexing the documentation and embedding incoming questions.
model = SentenceTransformer('all-MiniLM-L6-v2')
16
+
17
def fetch_documentation(url: str) -> str:
    """Download a web page and return its visible text.

    The page is fetched with a browser-like User-Agent and a 10 second
    timeout. ``script``, ``style``, ``nav``, ``footer`` and ``header``
    elements are removed before the text is extracted.

    Args:
        url: Address of the page to download.

    Returns:
        The page text with blank lines dropped and every run of whitespace
        inside a line collapsed to a single space. Line breaks between
        non-empty lines are kept.

    Raises:
        RuntimeError: If the request or the parsing fails. The message is
            ``"Error fetching URL: <reason>"`` and the original exception is
            attached as ``__cause__``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Strip page chrome and non-content elements before extracting text.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        text = soup.get_text()

        # Normalise whitespace per line. Splitting on a single space (as the
        # previous version did) put every word on its own line, which
        # destroyed the line structure of the page.
        cleaned_lines = (" ".join(line.split()) for line in text.splitlines())
        return '\n'.join(line for line in cleaned_lines if line)
    except Exception as e:
        # Keep the caller-visible message but preserve the original traceback.
        raise RuntimeError(f"Error fetching URL: {str(e)}") from e
39
+
40
def _overlap_tail(chunk: str, overlap: int) -> str:
    """Return at most ``overlap`` trailing characters of ``chunk``, starting on a word boundary."""
    if overlap <= 0 or not chunk:
        return ""
    if len(chunk) <= overlap:
        return chunk
    tail = chunk[-overlap:]
    if not chunk[-overlap - 1].isspace():
        # The cut landed inside a word: drop the partial word at the front.
        _, sep, rest = tail.partition(" ")
        tail = rest if sep else ""
    return tail.strip()


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split ``text`` into sentence-aligned chunks with optional overlap.

    The text is split on runs of ``.``, ``!`` and ``?``; each sentence is
    re-terminated with ``". "`` and sentences are packed greedily into chunks.
    A single sentence longer than ``chunk_size`` is kept whole in its own
    chunk rather than being cut.

    Args:
        text: Text to split.
        chunk_size: Target upper bound, in characters, for a chunk.
        overlap: Maximum number of trailing characters of a finished chunk
            that are repeated at the start of the next one, so context is
            not lost at chunk borders. The repeated tail starts on a word
            boundary and is skipped when it would not fit next to the
            incoming sentence. ``0`` (or a negative value) disables overlap.

    Returns:
        The list of chunks, in document order. Empty when ``text`` contains
        no sentence text.
    """
    chunks: List[str] = []
    current_chunk = ""

    for raw_sentence in re.split(r'[.!?]+', text):
        sentence = raw_sentence.strip()
        if not sentence:
            continue

        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += sentence + ". "
            continue

        # The sentence does not fit: close the current chunk (if any) and
        # start a new one, seeded with the tail of the chunk just closed.
        carry = ""
        if current_chunk:
            finished = current_chunk.strip()
            chunks.append(finished)
            tail = _overlap_tail(finished, overlap)
            if tail and len(tail) + 1 + len(sentence) < chunk_size:
                carry = tail + " "
        current_chunk = carry + sentence + ". "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
61
+
62
def process_documentation(url: str) -> str:
    """Fetch a documentation page, chunk it, embed it and build the search index.

    On success the module-level ``doc_chunks``, ``doc_embeddings``, ``index``
    and ``source_url`` are replaced together. On any failure they are left
    exactly as they were, so a previously processed page stays queryable and
    the chunk list can never get out of sync with the index.

    Args:
        url: Address of the documentation page. Surrounding whitespace is
            ignored; an empty value is rejected.

    Returns:
        A human-readable status message: a success summary, a validation
        hint, or ``"Error: <reason>"`` when fetching or indexing fails.
    """
    global doc_chunks, doc_embeddings, index, source_url

    url = (url or "").strip()
    if not url:
        return "Please provide a URL"

    try:
        print("Fetching documentation...")
        text = fetch_documentation(url)

        if len(text) < 100:
            return "Retrieved content is too short. Please check the URL."

        print("Chunking text...")
        new_chunks = chunk_text(text)

        if not new_chunks:
            return "No content chunks created. The documentation might be empty."

        print(f"Creating embeddings for {len(new_chunks)} chunks...")
        new_embeddings = model.encode(new_chunks, show_progress_bar=False)

        # Build the index on locals first; the globals are only touched once
        # every step has succeeded.
        dimension = new_embeddings.shape[1]
        new_index = faiss.IndexFlatL2(dimension)
        new_index.add(new_embeddings.astype('float32'))
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return f"Error: {str(e)}"

    doc_chunks = new_chunks
    doc_embeddings = new_embeddings
    index = new_index
    source_url = url

    return f"Documentation processed successfully!\n\nStatistics:\n- Chunks created: {len(new_chunks)}\n- Text length: {len(text)} characters\n- Ready to answer questions!"
100
+
101
def answer_question(question: str, top_k: int = 3) -> Tuple[str, str]:
    """Retrieve the chunks most similar to ``question`` and format them.

    The question is embedded with the shared ``model`` and searched against
    the module-level ``index``; no text is generated, the answer is assembled
    from the retrieved chunks themselves.

    Args:
        question: The user's question. Empty or whitespace-only input is
            rejected.
        top_k: Number of chunks to retrieve. Coerced to ``int`` and clamped
            to the range ``1..len(doc_chunks)``.

    Returns:
        ``(answer, sources)``. ``answer`` shows the best chunk plus, when
        available, the second best; ``sources`` lists every retrieved chunk
        with a ``1 / (1 + distance)`` score. On a validation problem or an
        error the first element carries the message and the second is ``""``.
    """
    question = (question or "").strip()
    if not question:
        return "Please enter a question", ""

    if index is None or not doc_chunks:
        return "Please process documentation first by entering a URL above", ""

    try:
        # UI sliders may deliver a float, and asking FAISS for more
        # neighbours than there are vectors pads the result with index -1,
        # which would silently select doc_chunks[-1].
        k = max(1, min(int(top_k), len(doc_chunks)))

        question_embedding = model.encode([question])
        distances, indices = index.search(question_embedding.astype('float32'), k)

        # Keep only real hits; skip any padding or out-of-range ids.
        hits = [
            (doc_chunks[i], dist)
            for i, dist in zip(indices[0], distances[0])
            if 0 <= i < len(doc_chunks)
        ]
        if not hits:
            return "No relevant content found in the processed documentation", ""

        answer = f"Based on the documentation at {source_url}:\n\n"
        answer += f"Relevant Information:\n\n{hits[0][0]}"

        if len(hits) > 1:
            answer += f"\n\nAdditional Context:\n\n{hits[1][0]}"

        sources = "Retrieved Chunks:\n\n" + "".join(
            f"Chunk {rank} (similarity: {1/(1+dist):.3f}):\n{chunk}\n\n---\n\n"
            for rank, (chunk, dist) in enumerate(hits, start=1)
        )

        return answer, sources

    except Exception as e:
        # UI boundary: show the failure in the answer box instead of raising.
        return f"Error: {str(e)}", ""
133
+
134
# Gradio front end: one section to ingest a documentation URL, one to ask
# questions, and a side-by-side view of the answer and the retrieved chunks.
with gr.Blocks(theme=gr.themes.Soft(), title="Documentation RAG System") as demo:
    gr.Markdown("# Documentation RAG System\n\nEnter a documentation URL, process it, then ask questions about the content using AI-powered retrieval.")

    # Step 1: URL entry, trigger button and processing status.
    with gr.Row(), gr.Column():
        url_input = gr.Textbox(
            label="Documentation URL",
            placeholder="https://docs.python.org/3/tutorial/index.html",
            lines=1,
        )
        process_btn = gr.Button("Process Documentation", variant="primary")
        status_output = gr.Textbox(label="Status", lines=6, interactive=False)

    gr.Markdown("---")

    # Step 2: question entry and retrieval depth.
    with gr.Row(), gr.Column():
        question_input = gr.Textbox(
            label="Your Question",
            placeholder="What is this documentation about?",
            lines=3,
        )
        top_k_slider = gr.Slider(
            minimum=1,
            maximum=5,
            value=3,
            step=1,
            label="Number of chunks to retrieve",
        )
        ask_btn = gr.Button("Ask Question", variant="primary")

    # Step 3: results, answer on the left and supporting chunks on the right.
    with gr.Row():
        with gr.Column():
            answer_output = gr.Textbox(label="Answer", lines=10, interactive=False)
        with gr.Column():
            sources_output = gr.Textbox(label="Source Chunks", lines=10, interactive=False)

    gr.Markdown("### Example URLs to try:")
    example_urls = (
        "https://docs.python.org/3/tutorial/introduction.html",
        "https://pytorch.org/docs/stable/torch.html",
        "https://huggingface.co/docs/transformers/quicktour",
    )
    gr.Examples(examples=[[example] for example in example_urls], inputs=url_input)

    # Event wiring. The button click and the textbox submit share one handler
    # configuration so both paths behave the same.
    process_btn.click(fn=process_documentation, inputs=[url_input], outputs=[status_output])

    ask_wiring = {
        "fn": answer_question,
        "inputs": [question_input, top_k_slider],
        "outputs": [answer_output, sources_output],
    }
    ask_btn.click(**ask_wiring)
    question_input.submit(**ask_wiring)

if __name__ == "__main__":
    demo.launch()