Vivek Vaddina committed on
Commit
725c96a
·
unverified ·
1 Parent(s): f5b5f0d

✨ Working version of RAG with HYDE

Browse files
Files changed (6) hide show
  1. app.py +459 -9
  2. src/__init__.py +0 -0
  3. src/config.py +23 -0
  4. src/hyde_rag.py +206 -0
  5. src/main.py +504 -0
  6. src/prompts.yaml +307 -0
app.py CHANGED
@@ -1,18 +1,468 @@
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
 
3
- def greet(files, query):
4
- names = [file.name for file in files]
5
- return f"{len(files)}, {names}, {query}"
 
 
 
 
 
 
6
 
7
- with gr.Blocks() as demo:
8
- pdf_input = gr.File(label='upload PDF(s)', file_types=['.pdf'], file_count='multiple')
9
- query = gr.Textbox(label='question')
10
- answer = gr.Textbox(label='answer')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  btn = gr.Button('Submit')
13
- btn.click(greet, inputs=[pdf_input, query], outputs=answer)
 
 
 
14
 
15
 
16
  if __name__ == '__main__':
17
- demo.launch()
18
 
 
1
+ import re
2
+ import math
3
+ import yaml
4
+ import json
5
+ import torch
6
+ import faiss
7
+ import string
8
+ import asyncio
9
+ import pymupdf
10
  import gradio as gr
11
 
12
+ from time import time
13
+ from pathlib import Path
14
+ from functools import lru_cache
15
+ from ast import literal_eval
16
+ from collections import defaultdict
17
+ from concurrent.futures import ThreadPoolExecutor
18
+ from sentence_transformers import SentenceTransformer
19
+ from langchain.text_splitter import SentenceTransformersTokenTextSplitter
20
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
21
 
22
+ from src.config import PROMPTS_FILEPATH, log
23
+
24
+
25
async def load_pdfs(files, max_concurrence=5):
    """
    Load multiple PDF files concurrently.

    Args:
        files (list): Uploaded file objects (each has a ``.name`` attribute
            and is accepted directly by ``pymupdf.open``).
        max_concurrence (int): Maximum number of concurrent PDF loads.

    Returns:
        list: ``(filename, extracted_text)`` tuples. Files that failed to
        parse are still returned, with an empty text string.
    """

    def _load_pdf_sync(file):
        """Blocking PDF text extraction, executed inside the thread pool."""
        text = ""
        try:
            with pymupdf.open(file, filetype="pdf") as doc:
                text = "\n".join(page.get_text() for page in doc)
        except Exception:
            # Best-effort: log the failure and carry on with empty text.
            log.exception(f"Error reading {file.name}")

        return (file.name, text)

    # BUG FIX: get_running_loop() is the supported way to obtain the loop
    # from inside a coroutine; get_event_loop() here is deprecated (3.10+).
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=max_concurrence) as executor:
        futures = [
            loop.run_in_executor(executor, _load_pdf_sync, file)
            for file in files
            if file is not None
        ]

        results = await asyncio.gather(*futures, return_exceptions=True)

    valid_results = [result for result in results if not isinstance(result, Exception)]

    log.info(f"successfully processed {len(valid_results)} out of {len(files)} PDFs ")
    return valid_results
63
+
64
+
65
async def build_corpus(pdfs, text_splitter, **load_kwargs):
    """Split every loaded PDF into chunks; return parallel (corpus, meta) lists."""
    corpus, meta = [], []
    for file_name, raw_text in await load_pdfs(pdfs, **load_kwargs):
        for chunk_id, chunk in enumerate(text_splitter.split_text(raw_text)):
            corpus.append(chunk)
            meta.append({"file": file_name, "chunk_id": chunk_id})
    return corpus, meta
74
+
75
+
76
def generate_text(
    tokenizer, model, user_prompts, system_prompt=None, **llm_kwargs
):
    """
    Batched chat generation with a HF tokenizer/model pair.

    Args:
        tokenizer: HF tokenizer with a chat template (left padding for
            causal LMs is assumed to be configured at load time).
        model: HF model exposing ``.generate``.
        user_prompts (str | list[str]): One prompt or a batch of prompts.
        system_prompt (str | None): System message; falls back to a generic
            assistant persona when None or empty.
        **llm_kwargs: ``max_new_tokens`` (default 512) and ``temperature``
            (default 0.4) are consumed here; any other keys are ignored.

    Returns:
        str | list[str]: A single decoded response for a single prompt,
        otherwise one response per prompt.
    """
    # BUG FIX: the original `if system_prompt is None or "":` only checked
    # for None — a bare "" is always falsy — so an explicitly empty system
    # prompt never received the default persona. `not system_prompt` covers both.
    if not system_prompt:
        system_prompt = "You are a helpful assistant."

    if isinstance(user_prompts, str):
        user_prompts = [user_prompts]

    messages = [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        for user_prompt in user_prompts
    ]

    texts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    model_inputs = tokenizer(
        texts, return_tensors="pt", truncation=True, padding=True
    ).to(model.device)
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=llm_kwargs.pop("max_new_tokens", 512),
        temperature=llm_kwargs.pop("temperature", 0.4),
    )
    # Slice off the prompt tokens so only the newly generated tail remains.
    generated_ids = [
        output_ids[len(input_ids) :]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return response if len(user_prompts) > 1 else response[0]
111
+
112
+
113
def load_models(
    embed_model_name: str,
    gen_model_name: str,
    causal_lm: bool = False,
    device=None,
    bitsandbytesconfig=None,
):
    """
    Load the sentence embedder plus the generator LLM (seq2seq or causal).

    Args:
        embed_model_name: SentenceTransformer model id.
        gen_model_name: HF model id for the generator model.
        causal_lm: When True, load a causal LM with left padding (required
            for batched generation); otherwise a seq2seq model.
        device: Torch device string; auto-selects "cuda" when available.
        bitsandbytesconfig: Optional quantization config forwarded to
            ``from_pretrained``. NOTE(review): callers must pass this by
            keyword — the 4th positional slot here is ``device``.

    Returns:
        tuple: (embedder, tokenizer, generator model).
    """
    # This will take some time to run for the first time if the model(s) don't exist locally.
    if not device:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    embedder = SentenceTransformer(
        embed_model_name,
        device=device,
        # fp16 only on GPU; CPU inference stays fp32.
        model_kwargs={"dtype": "float16"} if device == "cuda" else {},
    )

    if not causal_lm:
        tok = AutoTokenizer.from_pretrained(gen_model_name)
        gen = AutoModelForSeq2SeqLM.from_pretrained(
            gen_model_name,  # device_map='auto',
            quantization_config=bitsandbytesconfig if bitsandbytesconfig else None,
        )
    else:
        # Left padding so the generated continuation aligns with each prompt.
        tok = AutoTokenizer.from_pretrained(gen_model_name, padding_side="left")
        gen = AutoModelForCausalLM.from_pretrained(
            gen_model_name,
            dtype="float16",  # device_map='auto',
            quantization_config=bitsandbytesconfig if bitsandbytesconfig else None,
        )
    # NOTE(review): calling .to(device) on a bitsandbytes-quantized model
    # raises in recent transformers — confirm if quantization is ever enabled.
    gen.to(device)
    return embedder, tok, gen
144
+
145
+
146
def make_query_variants(
    tokenizer, model, query: str, prompt: str, n: int = 3, **llm_kwargs
):
    """Ask the LLM for at least ``n`` rewordings of ``query``.

    The original query is always returned as the first entry.
    """
    suffix = f"Now give me at least {n} variations."
    raw = generate_text(tokenizer, model, query + suffix, prompt, **llm_kwargs)

    # Drop any leading "1. "-style numbering, then split into lines and
    # discard blank ones.
    lines = re.sub(r"^\d+\.\s*", "", raw, flags=re.MULTILINE).split("\n")
    variants = [line for line in lines if line.strip()]
    return [query] + variants
154
+
155
+
156
def clean_rewrite_resp(resp):
    """
    Best-effort parse of an LLM "rewrite" response into a dict.

    Tries JSON first, then a Python-literal fallback. When the response is a
    string with extra text wrapped around a ``{...}`` payload, the payload is
    trimmed out and re-parsed. Returns the parsed object, or the (possibly
    trimmed) string when nothing parses.
    """
    try:
        resp = json.loads(resp)  # Parse JSON
    except json.JSONDecodeError:
        try:
            resp = literal_eval(resp)  # Fallback parse
        except Exception:
            pass  # Keep resp as-is if both fail

    # Ensure resp is a string before strip and slicing
    if isinstance(resp, str):
        resp = resp.strip()
        if resp:
            start = resp.find("{")
            if start != -1:
                end = resp[::-1].find("}")
                if end != -1:
                    trimmed = resp[start : len(resp) - end]
                    # BUG FIX: only recurse when trimming made progress;
                    # previously an unparseable "{...}" string recursed on
                    # itself forever and raised RecursionError.
                    if trimmed != resp:
                        return clean_rewrite_resp(trimmed)
    return resp
176
+
177
+
178
def transform_query(
    tokenizer, model, query: str, rewrite_prompt: str, **llm_kwargs
) -> dict:
    """Split the query into things to search and actions to take.

    Returns the parsed dict when the LLM produced valid JSON/literal output,
    otherwise the raw response string.
    """
    resp = generate_text(tokenizer, model, query, rewrite_prompt, **llm_kwargs)
    try:
        resp = clean_rewrite_resp(resp)
    except Exception:
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit. Parsing is best-effort; fall back to the raw response.
        pass
    return resp
188
+
189
+
190
def aggregate_queries_and_tasks(
    tokenizer,
    model,
    orig_query,
    rewrite_prompt,
    variants_prompt,
    n_variations=3,
    **llm_kwargs,
):
    """
    Expand the user query into a pool of search queries plus a task list.

    Variants of the original query are generated first; the query is then
    rewritten into ``{"search": [...], "tasks": [...]}`` and every search
    entry is expanded into variants as well. Punctuation is stripped from
    each query/task before returning.
    """
    query = orig_query.strip()

    # make variations for the original query as is
    queries = make_query_variants(
        tokenizer, model, query, variants_prompt, n_variations, **llm_kwargs
    )

    start = time()
    rewritten = transform_query(tokenizer, model, query, rewrite_prompt)
    end = time()
    log.debug(f"\t\t transforming query task took {(end - start):.1f} seconds...")

    # The rewrite may yield several things to search and tasks to perform;
    # expand each search entry into variants, keep the tasks untouched.
    tasks = []
    if isinstance(rewritten, dict):
        searches, tasks = rewritten.get("search", []), rewritten.get("tasks", [])
        for search in searches:
            queries += make_query_variants(
                tokenizer, model, search, variants_prompt, n_variations, **llm_kwargs
            )

    queries = [q.strip(string.punctuation) for q in queries]
    tasks = [t.strip(string.punctuation) for t in tasks]

    # The original user query stays first in the list so a garbled LLM
    # rewrite can never drop it. (Shuffling/sub-sampling of variants was
    # removed in favour of batched generation over all variants at once.)
    return queries, tasks
242
+
243
+
244
def build_index(corpus_emb, n_cells=5, n_probe=2):
    """
    Build an IVF-Flat FAISS index (inner-product metric) over the embeddings.

    Args:
        corpus_emb: (n, d) float32 matrix of L2-normalised embeddings, so
            inner product equals cosine similarity.
        n_cells (int): Number of IVF clusters (Voronoi cells).
        n_probe (int): Number of cells visited per search.

    Returns:
        faiss.IndexIVFFlat: Trained index containing all corpus embeddings.
    """
    log.debug(f"building index with {n_cells=}, {n_probe=}")
    d = corpus_emb.shape[1]
    quantizer = faiss.IndexFlatIP(d)
    index = faiss.IndexIVFFlat(quantizer, d, n_cells)
    # BUG FIX: the FAISS attribute is `nprobe`; assigning `n_probe` left the
    # index searching only the default single cell, hurting recall.
    index.nprobe = n_probe
    index.train(corpus_emb)
    index.add(corpus_emb)
    return index
254
+
255
+
256
def reciprocal_rank_fusion(indices, top_k=3, denom=50):
    """
    Fuse several ranked hit lists into a single list of the top_k chunk ids.

    Each row of ``indices`` is a best-to-worst ranking; every occurrence of a
    chunk at 0-based rank r contributes 1 / (r + denom) to its fused score.
    """
    fused = defaultdict(int)
    for ranking in indices.tolist():
        for position, chunk_id in enumerate(ranking):
            fused[chunk_id] += 1 / (position + denom)
    ordered = sorted(fused.items(), key=lambda item: item[1], reverse=True)
    return [chunk_id for chunk_id, _ in ordered[:top_k]]
264
+
265
+
266
class HyDeRAGFusion:
    """
    HyDE + RAG-Fusion pipeline.

    PDFs are chunked, embedded and indexed in FAISS; at query time the user
    query is expanded into variants, each variant is turned into a
    hypothetical answer document (HyDE), and the per-variant retrievals are
    fused with reciprocal rank fusion before the generator LLM writes the
    final answer.
    """

    def __init__(
        self,
        embed_model: str,
        generator_llm_model: str,
        causal_lm: bool = True,
        chunk_overlap: int = 50,
        tokens_per_chunk: int = 256,
        embed_batch_size: int = 64,
        bitsandbytesconfig=None,
    ):
        self.embed_batch_size = embed_batch_size
        self.text_splitter = SentenceTransformersTokenTextSplitter(
            chunk_overlap, embed_model, tokens_per_chunk
        )
        # BUG FIX: bitsandbytesconfig was previously passed positionally and
        # landed in load_models' 4th parameter (`device`), so any quantization
        # config was silently never applied; pass it by keyword.
        self.embedder, self.tok, self.gen = load_models(
            embed_model,
            generator_llm_model,
            causal_lm,
            bitsandbytesconfig=bitsandbytesconfig,
        )
        with open(PROMPTS_FILEPATH) as fl:
            self.prompts = yaml.safe_load(fl)

    # NOTE(review): lru_cache on a method keeps `self` alive for the cache's
    # lifetime (ruff B019); tolerable here because the app holds one global
    # long-lived instance. `pdfs` must be hashable (a tuple) for caching to
    # work, and the kwargs must be left at their defaults.
    @lru_cache(maxsize=8)
    def preprocess_pdfs(self, pdfs, data_load_kwargs=None, faiss_index_kwargs=None):
        """Load, chunk, embed and FAISS-index the given tuple of PDF files."""
        # BUG FIX: these previously defaulted to shared mutable dicts.
        data_load_kwargs = data_load_kwargs or {}
        faiss_index_kwargs = faiss_index_kwargs or {}

        self.corpus, self.meta = asyncio.run(
            build_corpus(pdfs, self.text_splitter, **data_load_kwargs)
        )
        log.debug(f"{len(self.corpus)}, {len(self.meta)}")
        self.corpus_emb = self.embedder.encode(
            self.corpus,
            batch_size=self.embed_batch_size,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        log.debug(f'{self.corpus_emb.shape}')

        # one centroid for every ~100 vectors and ~20% of them probed;
        # see https://github.com/facebookresearch/faiss/issues/112
        n_cells = faiss_index_kwargs.pop("n_cells", self.corpus_emb.shape[0] // 100 + 1)
        n_probe = faiss_index_kwargs.pop("n_probe", math.ceil(0.2 * n_cells))

        self.index = build_index(self.corpus_emb, n_cells, n_probe)

    def retrieve(
        self, query, n_variants=3, top_k_per_variant=10, top_k_retrieve=3, **llm_kwargs
    ):
        """Return (chunk_ids, tasks): RRF-fused top chunks plus any tasks the
        query-rewrite step extracted from the user query."""
        start = time()

        queries, tasks = aggregate_queries_and_tasks(
            self.tok,
            self.gen,
            query.strip(),
            self.prompts["rewrite"],
            self.prompts["variants"],
            n_variants,
            **llm_kwargs,
        )

        end = time()
        log.debug(f"aggregate task took {(end - start):.1f} seconds...")

        # HyDE: generate hypothetical answer documents for all variants in
        # one batch and embed those instead of the raw queries.
        start = time()
        hyde_docs = generate_text(
            self.tok, self.gen, queries, self.prompts["hyde"], **llm_kwargs
        )
        end = time()
        log.debug(f"generating hyde docs took {(end - start):.1f} seconds...")

        start = time()
        chunks = []
        for hyde_doc in hyde_docs:
            chunks.extend(self.text_splitter.split_text(hyde_doc))
        q_emb = self.embedder.encode(
            chunks, batch_size=self.embed_batch_size, normalize_embeddings=True
        )
        end = time()
        log.debug(f"embedding hyde docs took {(end - start):.1f} seconds...")

        _, I = self.index.search(q_emb, top_k_per_variant)
        chunk_ids = reciprocal_rank_fusion(I, top_k_retrieve)
        return chunk_ids, tasks

    def answer(self, query, doc_ids, tasks, max_ctx_chars=128000):
        """Generate the final answer from the retrieved chunks and tasks,
        keeping the stitched context under ``max_ctx_chars`` characters
        (prompt_length reserves headroom for the system prompt)."""
        total, text, prompt_length = 0, "", 10000
        sep = "\n\n-----\n\n"
        tasks = ", ".join(tasks)

        for doc_id in doc_ids:
            # adding tags in the context caused more hallucinations.
            # Instead, we list them as sources beneath the model response.
            chunk = self.corpus[doc_id].strip()
            tag = ""

            ctx = f"{sep}{tag}\n\n{chunk}"
            if total + len(ctx) + len(tasks) + len(sep) + prompt_length > max_ctx_chars:
                break

            text += ctx
            total = len(text)

        text += f"{sep}{tasks}"

        instruction = "go ahead and answer!"
        user_query = f"\nq: {query}\n\nctx:{text}" + f"\n\n{instruction}\n\n"

        start = time()
        resp = generate_text(
            self.tok,
            self.gen,
            user_query,
            self.prompts["final_answer"],
            temperature=0.3,
        )
        end = time()
        log.debug(f"final resp took {(end - start):.1f} seconds...")

        return resp
387
+
388
+
389
def initial_setup(embed_model, generator_model, bitsandbytesconfig=None):
    """Construct the app-wide HyDeRAGFusion pipeline (downloads models on first run)."""
    return HyDeRAGFusion(
        embed_model, generator_model, bitsandbytesconfig=bitsandbytesconfig
    )


# Module-level side effect: the models are loaded once at import time so the
# Gradio app reuses a single pipeline instance across requests.
start = time()
HRF = initial_setup("sentence-transformers/LaBSE", "Qwen/Qwen2.5-0.5B-Instruct")
end = time()
msg = f"init took {(end - start):.1f} seconds"
log.debug(msg)
399
+
400
def main(
    pdfs, query,
    n_variants=3,
    top_k_per_variant=5,
    top_k_retrieve=3,
    temperature=0.4,
    max_new_tokens=512,
):
    """
    Gradio callback: (re)index the uploaded PDFs, then answer the query.

    Returns:
        str: Markdown with the answer followed by the top retrieved sources,
        or an empty string when no query was provided.
    """
    start = time()
    # BUG FIX: `resp` was only assigned inside `if query:`; an empty query
    # could previously reach `return resp` unbound.
    resp = ""
    if pdfs:
        # Sorted tuple -> stable, hashable cache key for preprocess_pdfs.
        # (assumes gradio hands us sortable file paths — TODO confirm)
        HRF.preprocess_pdfs(tuple(sorted(pdfs)))

    if query:
        llm_kwargs = {
            "temperature": temperature,
            "max_new_tokens": max_new_tokens,
        }
        doc_ids, tasks = HRF.retrieve(
            query,
            int(n_variants),
            int(top_k_per_variant),
            int(top_k_retrieve),
            **llm_kwargs,
        )
        docs = [HRF.corpus[doc_id] for doc_id in doc_ids]
        reply = HRF.answer(query, doc_ids, tasks)
        sources = [
            {
                "source": f"{Path(HRF.meta[doc_id]['file']).stem}:{HRF.meta[doc_id]['chunk_id']}",
                "content": doc,
            }
            for doc_id, doc in zip(doc_ids, docs)
        ]

        resp = f"{reply}\n\n{'-'*25}\n\n"
        resp += "Top 3 sources:"
        resp += f"\n\n{'-'*25}\n\n"
        for source in sources:
            resp += f'source: {source["source"]}\n\n'
            resp += source['content']
            resp += f"\n\n{'-'*25}\n\n"

    end = time()
    log.debug(f"final resp took {(end - start):.1f} seconds")
    return resp
445
+
446
def reset_text_on_file_change(pdfs):
    """Clear the question textbox whenever the uploaded documents change."""
    return ""
451
+
452
+
453
# Gradio UI: PDF upload + question on one row, answer rendered as markdown.
with gr.Blocks(title='RAG with HYDE') as demo:
    gr.Markdown('# RAG with HYDE')
    with gr.Row():
        pdf_input = gr.File(label='upload PDF(s)', file_types=['.pdf'], file_count='multiple')
        query = gr.Textbox(label='question')

    btn = gr.Button('Submit')
    answer = gr.Markdown(label='### Answer')

    btn.click(main, inputs=[pdf_input, query], outputs=answer)
    # Changing the uploaded files clears the now-stale question.
    pdf_input.change(reset_text_on_file_change, inputs=pdf_input, outputs=query)
464
 
465
 
466
if __name__ == '__main__':
    # debug=True surfaces tracebacks in the browser during development.
    demo.launch(debug=True)
468
 
src/__init__.py ADDED
File without changes
src/config.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ PROMPTS_FILEPATH = Path("src/prompts.yaml")
5
+
6
+
7
def get_logger(LOG_LEVEL="INFO"):
    """
    Build (or fetch) the application logger, writing to ``logs.log``.

    Args:
        LOG_LEVEL (str | int): Logging level applied to the logger and its
            file handler.

    Returns:
        logging.Logger: Logger named "hyde_rag" with one file handler.
    """
    LOG_PATH = Path("logs.log")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    # BUG FIX: instantiate via getLogger() instead of logging.Logger() so the
    # logger is registered with the logging manager (hierarchy, reuse).
    log = logging.getLogger("hyde_rag")
    log.setLevel(LOG_LEVEL)

    # Guard against duplicate handlers: getLogger returns the same instance
    # on every call, so repeated get_logger() calls must not re-attach.
    if not log.handlers:
        file_handler = logging.FileHandler(LOG_PATH)
        file_handler.setLevel(LOG_LEVEL)
        file_handler.setFormatter(formatter)

        log.addHandler(file_handler)

    return log


log = get_logger("DEBUG")
src/hyde_rag.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # hyde_ragfusion.py
2
+ # Minimal HyDE + RAG-Fusion over local PDFs.
3
+ # Dependencies: transformers, sentence-transformers, scikit-learn, pymupdf, numpy
4
+
5
+ import os
6
+ import re
7
+ import heapq
8
+ import fitz # PyMuPDF
9
+ from sklearn.neighbors import NearestNeighbors
10
+ from sentence_transformers import SentenceTransformer
11
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
12
+
13
+
14
+ # -----------------------------
15
+ # Ingestion & Chunking
16
+ # -----------------------------
17
def load_pdfs(folder):
    """Extract text from every PDF directly inside *folder*.

    Returns:
        list: (filename, cleaned_text) tuples.
    """
    docs = []
    for fn in os.listdir(folder):
        if not fn.lower().endswith(".pdf"):
            continue
        with fitz.open(os.path.join(folder, fn)) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        # Collapse trailing whitespace before newlines and trim the ends.
        docs.append((fn, re.sub(r"\s+\n", "\n", text).strip()))
    return docs
27
+
28
+
29
def chunk_text(text, chunk_size=300, overlap=50):
    """
    Split *text* into word-based chunks of ``chunk_size`` words, with
    ``overlap`` words shared between consecutive chunks.

    Raises:
        ValueError: if ``overlap >= chunk_size`` — the original code looped
            forever in that case because the cursor never advanced.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks, i = [], 0
    while i < len(words):
        chunk = " ".join(words[i : i + chunk_size])
        chunks.append(chunk)
        i += chunk_size - overlap
    return chunks
37
+
38
+
39
def build_corpus(pdf_folder):
    """Chunk every PDF in *pdf_folder*; return parallel (corpus, meta) lists."""
    corpus, meta = [], []
    for fn, txt in load_pdfs(pdf_folder):
        for i, ch in enumerate(chunk_text(txt)):
            corpus.append(ch)
            meta.append({"file": fn, "chunk_id": i})
    return corpus, meta
47
+
48
+
49
+ # -----------------------------
50
+ # Models (local)
51
+ # -----------------------------
52
def load_models():
    """Load the local embedding model and the seq2seq generator.

    Returns:
        tuple: (SentenceTransformer embedder, tokenizer, generator model).
    """
    # Small, fast encoder for embeddings
    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    # Lightweight local generator for HyDE + answers
    gen_name = "google/flan-t5-base"
    tok = AutoTokenizer.from_pretrained(gen_name)
    gen = AutoModelForSeq2SeqLM.from_pretrained(gen_name)
    return embedder, tok, gen
60
+
61
+
62
+ # -----------------------------
63
+ # Index (cosine)
64
+ # -----------------------------
65
def fit_index(embeddings, n_neighbors=12):
    """
    Fit a cosine-distance k-NN index over *embeddings*.

    BUG FIX: ``n_neighbors`` was accepted but never used; it is now forwarded
    to NearestNeighbors as the default neighbour count. Callers that pass
    ``n_neighbors`` explicitly to ``kneighbors()`` are unaffected.
    """
    nn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine", algorithm="auto")
    nn.fit(embeddings)
    return nn
69
+
70
+
71
+ # -----------------------------
72
+ # RAG-Fusion (query variants) + HyDE
73
+ # -----------------------------
74
# RAG-Fusion prompt: rewrite the user query into n diverse search variants.
Q_VARIANTS_PROMPT = """You rewrite the user query into {n} diverse, specific search queries (short).
User query: "{q}"
Return each on a new line, no numbering, no extra text."""

# HyDE prompt: generate a hypothetical answer paragraph to embed in place of
# the raw query.
HYDE_PROMPT = """Write a factual, neutral, self-contained paragraph that could answer:
"{q}"
Avoid fluff. Include likely key terms and entities. 120-180 words."""

# Final-answer prompt: grounded strictly in the retrieved context, with
# inline (file:chunk) citations.
ANSWER_PROMPT = """You are a helpful assistant. Use ONLY the provided context.
Question: {q}

Context:
{ctx}

Answer concisely and cite file names & chunk ids inline like (file:chunk).
"""
90
+
91
+
92
def generate_text(gen, tok, prompt, max_new_tokens=160, temperature=0.3):
    """
    Greedy-decode a response for *prompt* with the seq2seq generator.

    Note: decoding is greedy (``do_sample=False``), so ``temperature`` has no
    effect; it is kept in the signature for backward compatibility but is no
    longer forwarded — previously it only triggered a transformers
    "temperature is ignored when do_sample=False" warning.
    """
    inputs = tok(prompt, return_tensors="pt")
    out = gen.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )
    return tok.decode(out[0], skip_special_tokens=True).strip()
101
+
102
+
103
def make_query_variants(gen, tok, q, n=4):
    """Generate up to *n* unique query variants; the original query is
    appended so it is never lost."""
    raw = generate_text(
        gen, tok, Q_VARIANTS_PROMPT.format(q=q, n=n), max_new_tokens=120
    )
    # Strip bullets/dashes/whitespace, drop blank lines, then dedupe while
    # preserving order.
    candidates = [line.strip(" -•\t") for line in raw.split("\n") if line.strip()]
    seen = set()
    uniq = []
    for cand in candidates + [q]:
        if cand not in seen:
            seen.add(cand)
            uniq.append(cand)
    return uniq[:n]
116
+
117
+
118
def hyde_doc(gen, tok, q):
    """Generate a hypothetical answer paragraph (HyDE) for query *q*."""
    prompt = HYDE_PROMPT.format(q=q)
    return generate_text(gen, tok, prompt, max_new_tokens=220)
120
+
121
+
122
+ # -----------------------------
123
+ # Retrieval + RRF
124
+ # -----------------------------
125
def cosine_search(nn, corpus_embeddings, query_vec, top_k=8):
    """Return [(doc_id, cosine_similarity), ...] for the top_k neighbours of
    *query_vec* (kneighbors returns cosine distance; similarity = 1 - d)."""
    dists, idxs = nn.kneighbors(query_vec.reshape(1, -1), n_neighbors=top_k)
    doc_ids = idxs[0].tolist()
    sims = (1 - dists[0]).tolist()
    return list(zip(doc_ids, sims))
130
+
131
+
132
def reciprocal_rank_fusion(rank_lists, k=60, top_k=8):
    """
    Fuse several best-to-worst doc-id rankings via reciprocal-rank fusion.

    Each appearance of a doc at 1-based rank r adds 1/(k + r) to its score;
    the top_k docs by fused score are returned.
    """
    scores = {}
    for ranking in rank_lists:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    # heapq.nlargest is documented as equivalent to this sorted()[:n] form.
    ordered = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return [doc_id for doc_id, _ in ordered[:top_k]]
141
+
142
+
143
+ # -----------------------------
144
+ # Pipeline
145
+ # -----------------------------
146
class HyDeRAGFusion:
    """End-to-end HyDE + RAG-Fusion pipeline over a folder of local PDFs."""

    def __init__(self, pdf_folder):
        # Chunk all PDFs once up front and embed the whole corpus.
        self.corpus, self.meta = build_corpus(pdf_folder)
        self.embedder, self.tok, self.gen = load_models()
        self.corpus_emb = self.embedder.encode(
            self.corpus,
            batch_size=64,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        # Cosine kNN index over the normalised chunk embeddings.
        self.nn = fit_index(self.corpus_emb)

    def retrieve(self, query, n_variants=4, per_variant_k=8, final_top_k=6, rrf_k=60):
        """Retrieve final_top_k chunk ids by HyDE-embedding every query
        variant and fusing the per-variant rankings with RRF."""
        variants = make_query_variants(self.gen, self.tok, query, n=n_variants)
        rank_lists = []
        for v in variants:
            hypo = hyde_doc(self.gen, self.tok, v)  # HyDE
            q_vec = self.embedder.encode([hypo], normalize_embeddings=True)[0]
            hits = cosine_search(self.nn, self.corpus_emb, q_vec, top_k=per_variant_k)
            rank_lists.append([doc_id for doc_id, _ in hits])
        fused = reciprocal_rank_fusion(rank_lists, k=rrf_k, top_k=final_top_k)
        return fused

    def answer(self, query, doc_ids, max_ctx_chars=4000):
        """Answer *query* from the retrieved chunks, keeping the stitched
        context under max_ctx_chars characters."""
        # Build compact context with inline provenance
        ctx_parts = []
        total = 0
        for i in doc_ids:
            piece = self.corpus[i]
            tag = f"(source: {self.meta[i]['file']}:{self.meta[i]['chunk_id']})"
            chunk = piece.strip()
            # The +5 budgets roughly for the joiner between parts.
            if total + len(chunk) + len(tag) + 5 > max_ctx_chars:
                break
            ctx_parts.append(f"{chunk}\n{tag}")
            total += len(chunk) + len(tag) + 5
        ctx = "\n\n---\n\n".join(ctx_parts)
        prompt = ANSWER_PROMPT.format(q=query, ctx=ctx)
        return generate_text(self.gen, self.tok, prompt, max_new_tokens=300)
184
+
185
+
186
+ # -----------------------------
187
+ # Example usage
188
+ # -----------------------------
189
if __name__ == "__main__":
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("--pdf_folder", required=True, help="Folder with PDFs to index")
    ap.add_argument("--query", required=True, help="Your user question")
    ap.add_argument("--show_sources", action="store_true")
    args = ap.parse_args()

    # One-shot CLI flow: index, retrieve, answer.
    rag = HyDeRAGFusion(args.pdf_folder)
    doc_ids = rag.retrieve(args.query)
    answer = rag.answer(args.query, doc_ids)
    print("\n=== ANSWER ===\n")
    print(answer)
    if args.show_sources:
        print("\n=== TOP SOURCES ===")
        for i in doc_ids:
            print(f"- {rag.meta[i]['file']}:{rag.meta[i]['chunk_id']}")
src/main.py ADDED
@@ -0,0 +1,504 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pymupdf
2
+ import math
3
+ import faiss
4
+ import string
5
+ import yaml
6
+ import re
7
+ import json
8
+ import asyncio
9
+ import torch
10
+ import streamlit as st
11
+ import click
12
+
13
+ from collections import defaultdict
14
+ from ast import literal_eval
15
+ from time import time
16
+ from sentence_transformers import SentenceTransformer
17
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
18
+ from concurrent.futures import ThreadPoolExecutor
19
+ from langchain.text_splitter import SentenceTransformersTokenTextSplitter
20
+ from src.config import PROMPTS_FILEPATH, log
21
+
22
+
23
async def load_pdfs(files, max_concurrence=5):
    """
    Load multiple uploaded PDF files concurrently.

    Args:
        files (list): Uploaded file objects exposing ``.name`` and
            ``.getvalue()`` (in-memory uploads, e.g. streamlit).
        max_concurrence (int): Maximum number of concurrent PDF loads.

    Returns:
        list: ``(filename, extracted_text)`` tuples. Files that failed to
        parse are still returned, with an empty text string.
    """

    def _load_pdf_sync(file):
        """Blocking PDF text extraction, executed inside the thread pool."""
        text = ""
        try:
            with pymupdf.open(stream=file.getvalue(), filetype="pdf") as doc:
                text = "\n".join(page.get_text() for page in doc)
        except Exception:
            # Best-effort: log the failure and carry on with empty text.
            log.exception(f"Error reading {file.name}")

        return (file.name, text)

    # BUG FIX: get_running_loop() is the supported way to obtain the loop
    # from inside a coroutine; get_event_loop() here is deprecated (3.10+).
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=max_concurrence) as executor:
        futures = [
            loop.run_in_executor(executor, _load_pdf_sync, file)
            for file in files
            if file is not None
        ]

        results = await asyncio.gather(*futures, return_exceptions=True)

    valid_results = [result for result in results if not isinstance(result, Exception)]

    log.info(f"successfully processed {len(valid_results)} out of {len(files)} PDFs ")
    return valid_results
61
+
62
+
63
async def build_corpus(pdfs, text_splitter, **load_kwargs):
    """Split every loaded PDF into chunks; return parallel (corpus, meta) lists."""
    corpus, meta = [], []
    for file_name, raw_text in await load_pdfs(pdfs, **load_kwargs):
        for chunk_id, chunk in enumerate(text_splitter.split_text(raw_text)):
            corpus.append(chunk)
            meta.append({"file": file_name, "chunk_id": chunk_id})
    return corpus, meta
72
+
73
+
74
def generate_text(
    tokenizer, model, user_prompts, system_prompt=None, **llm_kwargs
):
    """
    Batched chat generation with a HF tokenizer/model pair.

    Args:
        tokenizer: HF tokenizer with a chat template.
        model: HF model exposing ``.generate``.
        user_prompts (str | list[str]): One prompt or a batch of prompts.
        system_prompt (str | None): System message; falls back to a generic
            assistant persona when None or empty.
        **llm_kwargs: ``max_new_tokens`` (default 512) and ``temperature``
            (default 0.4) are consumed here; any other keys are ignored.

    Returns:
        str | list[str]: A single decoded response for a single prompt,
        otherwise one response per prompt.
    """
    # BUG FIX: the original `if system_prompt is None or "":` only checked
    # for None — a bare "" is always falsy — so an explicitly empty system
    # prompt never received the default persona. `not system_prompt` covers both.
    if not system_prompt:
        system_prompt = "You are a helpful assistant."

    if isinstance(user_prompts, str):
        user_prompts = [user_prompts]

    messages = [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        for user_prompt in user_prompts
    ]

    texts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    model_inputs = tokenizer(
        texts, return_tensors="pt", truncation=True, padding=True
    ).to(model.device)
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=llm_kwargs.pop("max_new_tokens", 512),
        temperature=llm_kwargs.pop("temperature", 0.4),
    )
    # Slice off the prompt tokens so only the newly generated tail remains.
    generated_ids = [
        output_ids[len(input_ids) :]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return response if len(user_prompts) > 1 else response[0]
109
+
110
+
111
def load_models(
    embed_model_name: str,
    gen_model_name: str,
    causal_lm: bool = False,
    device=None,
    bitsandbytesconfig=None,
):
    """
    Load the sentence embedder plus the generator LLM (seq2seq or causal).

    Args:
        embed_model_name: SentenceTransformer model id.
        gen_model_name: HF model id for the generator model.
        causal_lm: When True, load a causal LM with left padding (required
            for batched generation); otherwise a seq2seq model.
        device: Torch device string; auto-selects "cuda" when available.
        bitsandbytesconfig: Optional quantization config forwarded to
            ``from_pretrained``. NOTE(review): callers must pass this by
            keyword — the 4th positional slot is ``device``.

    Returns:
        tuple: (embedder, tokenizer, generator model).
    """
    # This will take some time to run for the first time if the model(s) don't exist locally.
    if not device:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    embedder = SentenceTransformer(
        embed_model_name,
        device=device,
        # fp16 only on GPU; CPU inference stays fp32.
        model_kwargs={"dtype": "float16"} if device == "cuda" else {},
    )

    if not causal_lm:
        tok = AutoTokenizer.from_pretrained(gen_model_name)
        gen = AutoModelForSeq2SeqLM.from_pretrained(
            gen_model_name,  # device_map='auto',
            quantization_config=bitsandbytesconfig if bitsandbytesconfig else None,
        )
    else:
        # Left padding so the generated continuation aligns with each prompt.
        tok = AutoTokenizer.from_pretrained(gen_model_name, padding_side="left")
        gen = AutoModelForCausalLM.from_pretrained(
            gen_model_name,
            dtype="float16",  # device_map='auto',
            quantization_config=bitsandbytesconfig if bitsandbytesconfig else None,
        )
    # NOTE(review): calling .to(device) on a bitsandbytes-quantized model
    # raises in recent transformers — confirm if quantization is ever enabled.
    gen.to(device)
    return embedder, tok, gen
142
+
143
+
144
def make_query_variants(
    tokenizer, model, query: str, prompt: str, n: int = 3, **llm_kwargs
):
    """Ask the LLM for at least ``n`` rewordings of ``query``.

    The original query is always returned as the first entry.
    """
    suffix = f"Now give me at least {n} variations."
    raw = generate_text(tokenizer, model, query + suffix, prompt, **llm_kwargs)

    # Drop any leading "1. "-style numbering, then split into lines and
    # discard blank ones.
    lines = re.sub(r"^\d+\.\s*", "", raw, flags=re.MULTILINE).split("\n")
    variants = [line for line in lines if line.strip()]
    return [query] + variants
152
+
153
+
154
def clean_rewrite_resp(resp):
    """
    Best-effort parse of an LLM "rewrite" response into a dict.

    Tries JSON first, then a Python-literal fallback. When the response is a
    string with extra text wrapped around a ``{...}`` payload, the payload is
    trimmed out and re-parsed. Returns the parsed object, or the (possibly
    trimmed) string when nothing parses.
    """
    try:
        resp = json.loads(resp)  # Parse JSON
    except json.JSONDecodeError:
        try:
            resp = literal_eval(resp)  # Fallback parse
        except Exception:
            pass  # Keep resp as-is if both fail

    # Ensure resp is a string before strip and slicing
    if isinstance(resp, str):
        resp = resp.strip()
        if resp:
            start = resp.find("{")
            if start != -1:
                end = resp[::-1].find("}")
                if end != -1:
                    trimmed = resp[start : len(resp) - end]
                    # BUG FIX: only recurse when trimming made progress;
                    # previously an unparseable "{...}" string recursed on
                    # itself forever and raised RecursionError.
                    if trimmed != resp:
                        return clean_rewrite_resp(trimmed)
    return resp
174
+
175
+
176
def transform_query(
    tokenizer, model, query: str, rewrite_prompt: str, **llm_kwargs
) -> dict:
    """Split the query into things to search and actions to take.

    Returns the parsed dict when the LLM produced valid JSON/literal output,
    otherwise the raw response string.
    """
    resp = generate_text(tokenizer, model, query, rewrite_prompt, **llm_kwargs)
    try:
        resp = clean_rewrite_resp(resp)
    except Exception:
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit. Parsing is best-effort; fall back to the raw response.
        pass
    return resp
186
+
187
+
188
def aggregate_queries_and_tasks(
    tokenizer,
    model,
    orig_query,
    rewrite_prompt,
    variants_prompt,
    n_variations=3,
    **llm_kwargs,
):
    """Expand a user query into search-query variants plus follow-up tasks.

    The original query is first rephrased ``n_variations`` times.  It is then
    decomposed (via ``transform_query``) into search statements and
    post-search tasks; each extracted search statement is rephrased as well,
    while the tasks are kept verbatim.  Returns ``(queries, tasks)`` with
    surrounding punctuation stripped from every entry.
    """
    # make variations for the original query as is
    queries = make_query_variants(
        tokenizer,
        model,
        orig_query.strip(),
        variants_prompt,
        n_variations,
        **llm_kwargs,
    )

    start = time()
    # BUGFIX: forward llm_kwargs (temperature, max_new_tokens, ...) here as
    # well, consistent with the make_query_variants calls above.
    tr_q = transform_query(
        tokenizer, model, orig_query.strip(), rewrite_prompt, **llm_kwargs
    )
    end = time()
    log.debug(f"\t\t transforming query task took {(end - start):.1f} seconds...")

    # transformed query might have multiple things to search and tasks to
    # perform depending on user query; recursively get variations for each of
    # the search queries but keep the tasks as is.
    tasks = []
    if isinstance(tr_q, dict):
        search_results, tasks = tr_q.get("search", []), tr_q.get("tasks", [])
        for search_result in search_results:
            queries.extend(
                make_query_variants(
                    tokenizer,
                    model,
                    search_result,
                    variants_prompt,
                    n_variations,
                    **llm_kwargs,
                )
            )

    queries = [q.strip(string.punctuation) for q in queries]
    tasks = [t.strip(string.punctuation) for t in tasks]

    # NOTE: shuffling/sub-sampling of variants is intentionally disabled.
    # All variants are batched through the LLM at once, so keeping every
    # variant costs little and improves recall:
    # q, queries = queries[:1], queries[1:]
    # shuffle(queries)
    # q += queries[:n_variations-1]

    return queries, tasks
240
+
241
+
242
def build_index(corpus_emb, n_cells=5, n_probe=2):
    """Build an IVF-Flat FAISS index over the (L2-normalised) embeddings.

    Parameters
    ----------
    corpus_emb : 2-D float array of shape (n_vectors, dim).
    n_cells : number of IVF partitions (centroids) to train.
    n_probe : number of partitions visited per search.
    """
    log.debug(f"building index with {n_cells=}, {n_probe=}")
    d = corpus_emb.shape[1]
    quantizer = faiss.IndexFlatIP(d)
    # BUGFIX: pass METRIC_INNER_PRODUCT explicitly — IndexIVFFlat defaults to
    # L2, which mismatched the inner-product quantizer above.
    index = faiss.IndexIVFFlat(quantizer, d, n_cells, faiss.METRIC_INNER_PRODUCT)
    # BUGFIX: FAISS spells this attribute `nprobe`; assigning `n_probe`
    # created an unused attribute and left the search breadth at its default.
    index.nprobe = n_probe
    index.train(corpus_emb)
    index.add(corpus_emb)
    # index.make_direct_map()
    return index
252
+
253
+
254
def reciprocal_rank_fusion(indices, top_k=3, denom=50):
    """Fuse ranked id lists from several queries via Reciprocal Rank Fusion.

    Parameters
    ----------
    indices : 2-D integer array as returned by ``faiss.Index.search`` —
        one row per query, ids ranked best-first.
    top_k : number of fused ids to return.
    denom : RRF smoothing constant added to the rank.

    Returns the ``top_k`` chunk ids with the highest fused score.
    """
    scores = defaultdict(float)
    for row in indices.tolist():
        for rank, chunk_id in enumerate(row):
            # BUGFIX: FAISS pads rows with -1 when fewer than k neighbours
            # exist; a padding id must not accumulate a fusion score.
            if chunk_id == -1:
                continue
            scores[chunk_id] += 1 / (rank + denom)
    best = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
    return [chunk_id for chunk_id, _ in best]
262
+
263
+
264
class HyDeRAGFusion:
    """HyDE + RAG-Fusion pipeline over a set of uploaded PDFs.

    Workflow: ``preprocess_pdfs`` builds the chunked corpus, embeds it and
    trains a FAISS IVF index; ``retrieve`` expands the user query into
    variants, generates hypothetical documents (HyDE), embeds them and fuses
    the per-variant hits with reciprocal rank fusion; ``answer`` stuffs the
    retrieved chunks into a grounded-answer prompt and queries the LLM.
    """

    def __init__(
        self,
        embed_model: str,
        generator_llm_model: str,
        causal_lm: bool = True,
        chunk_overlap: int = 50,
        tokens_per_chunk: int = 256,
        embed_batch_size: int = 64,
        bitsandbytesconfig=None,
    ):
        """Load the embedder, tokenizer, generator LLM and prompt templates.

        `causal_lm` selects between CausalLM and Seq2Seq generator loading;
        `bitsandbytesconfig` optionally enables quantised loading.
        """
        self.embed_batch_size = embed_batch_size
        self.text_splitter = SentenceTransformersTokenTextSplitter(
            chunk_overlap, embed_model, tokens_per_chunk
        )
        self.embedder, self.tok, self.gen = load_models(
            embed_model, generator_llm_model, causal_lm, bitsandbytesconfig
        )
        with open(PROMPTS_FILEPATH) as fl:
            self.prompts = yaml.safe_load(fl)

    def preprocess_pdfs(self, pdfs, data_load_kwargs={}, faiss_index_kwargs={}):
        """Chunk + embed the PDFs and build the FAISS index over them.

        Sets ``self.corpus`` (list of text chunks), ``self.meta`` (per-chunk
        metadata), ``self.corpus_emb`` and ``self.index``.

        NOTE(review): the mutable default dicts are shared across calls, and
        ``faiss_index_kwargs`` is mutated via ``pop`` — confirm callers always
        pass fresh dicts.
        """
        self.corpus, self.meta = asyncio.run(
            build_corpus(pdfs, self.text_splitter, **data_load_kwargs)
        )
        # Normalised embeddings so inner product == cosine similarity.
        self.corpus_emb = self.embedder.encode(
            self.corpus,
            batch_size=self.embed_batch_size,
            show_progress_bar=True,
            normalize_embeddings=True,
        )

        # https://github.com/facebookresearch/faiss/issues/112
        # n_cells = int(round(4 * (self.corpus_emb.shape[0])**.5))

        # one centroid for every 100 or so vectors and 20% of them as n_probe
        n_cells = faiss_index_kwargs.pop("n_cells", self.corpus_emb.shape[0] // 100 + 1)
        n_probe = faiss_index_kwargs.pop("n_probe", math.ceil(0.2 * n_cells))

        self.index = build_index(self.corpus_emb, n_cells, n_probe)

    def retrieve(
        self, query, n_variants=3, top_k_per_variant=10, top_k_retrieve=3, **llm_kwargs
    ):
        """Return ``(chunk_ids, tasks)`` for the query via HyDE + RRF.

        Query variants are expanded into hypothetical documents, those are
        chunked and embedded, the index is searched per chunk, and the hits
        are fused down to ``top_k_retrieve`` corpus chunk ids.
        """
        start = time()

        queries, tasks = aggregate_queries_and_tasks(
            self.tok,
            self.gen,
            query.strip(),
            self.prompts["rewrite"],
            self.prompts["variants"],
            n_variants,
            **llm_kwargs,
        )

        end = time()
        log.debug(f"aggregate task took {(end - start):.1f} seconds...")

        start = time()
        # HyDE: one hypothetical answer document per query variant (batched).
        hyde_docs = generate_text(
            self.tok, self.gen, queries, self.prompts["hyde"], **llm_kwargs
        )
        end = time()
        log.debug(f"generating hyde docs took {(end - start):.1f} seconds...")

        start = time()
        chunks = []
        for hyde_doc in hyde_docs:
            chunks.extend(self.text_splitter.split_text(hyde_doc))
        q_emb = self.embedder.encode(
            chunks, batch_size=self.embed_batch_size, normalize_embeddings=True
        )
        end = time()
        log.debug(f"embedding hyde docs took {(end - start):.1f} seconds...")

        _, I = self.index.search(q_emb, top_k_per_variant)
        chunk_ids = reciprocal_rank_fusion(I, top_k_retrieve)
        return chunk_ids, tasks

    def answer(self, query, doc_ids, tasks, max_ctx_chars=128000):
        """Generate the final grounded answer from the retrieved chunks.

        Concatenates chunks (separator-delimited) up to a character budget
        that reserves headroom for the prompt template, appends the task
        list, and asks the LLM to answer strictly from the context.
        """
        # prompt_length is a rough character allowance for the system prompt.
        total, text, prompt_length = 0, "", 10000
        sep = "\n\n-----\n\n"
        tasks = ", ".join(tasks)

        for doc_id in doc_ids:
            # adding tags in the context caused more hallucinations.
            # Instead, we list them as sources beneath the model response.
            # _meta = self.meta[doc_id]
            # tag = f"(source: {_meta['file_name']}:{_meta['chunk_id']})"
            chunk = self.corpus[doc_id].strip()
            tag = ""

            ctx = f"{sep}{tag}\n\n{chunk}"
            # stop adding chunks once the character budget would be exceeded
            if total + len(ctx) + len(tasks) + len(sep) + prompt_length > max_ctx_chars:
                break

            text += ctx
            total = len(text)

        text += f"{sep}{tasks}"

        # instruction = "Answer concisely and also cite file names & chunk ids inline like (pdf_file_name:chunk_id)."
        instruction = "go ahead and answer!"
        user_query = f"\nq: {query}\n\nctx:{text}" + f"\n\n{instruction}\n\n"

        start = time()
        # lower temperature than retrieval-time calls to keep the final
        # answer close to the provided context
        resp = generate_text(
            self.tok,
            self.gen,
            user_query,
            self.prompts["final_answer"],
            temperature=0.3,
        )
        end = time()
        log.debug(f"final resp took {(end - start):.1f} seconds...")

        return resp
382
+
383
+
384
@st.cache_resource
def initial_setup(embed_model, generator_model, bitsandbytesconfig=None):
    """Build (and cache) the HyDeRAGFusion pipeline for this session.

    ``st.cache_resource`` ensures the heavyweight models are loaded once per
    process and the same instance is reused across Streamlit reruns with the
    same arguments.
    """
    return HyDeRAGFusion(
        embed_model, generator_model, bitsandbytesconfig=bitsandbytesconfig
    )
389
+
390
+
391
@click.command(context_settings=dict(show_default=True))
@click.option(
    "--embed-model",
    default="sentence-transformers/LaBSE",
    help="sentence transformers embedding model",
)
@click.option(
    "--generator-llm-model",
    default="Qwen/Qwen2.5-0.5B-Instruct",
    help="Seq2Seq or CausalLM model (preferably multi-lingual)",
)
@click.option("--n-variants", default=3, help="no. of query variants")
@click.option(
    "--top-k-per-variant",
    default=5,
    help="top `k` hits per each query variant to consider for RRF",
)
@click.option(
    "--top-k-retrieve", default=3, help="top `k` chunks to retrieve after RRF"
)
@click.option("--temperature", default=0.4, help="LLM Model Temperature")
@click.option("--max-new-tokens", default=512, help="LLM max tokens")
@click.option(
    "--faiss-index-kwargs",
    default="{}",
    help="dict literal of kwargs to pass to FAISS Index such as `n_cells, n_probe`",
)
def main(
    embed_model,
    generator_llm_model,
    n_variants,
    top_k_per_variant,
    top_k_retrieve,
    temperature,
    max_new_tokens,
    faiss_index_kwargs,
):
    """Streamlit entry point: upload PDFs, build the index, answer queries."""
    # bits_and_bytes_cfg = BitsAndBytesConfig(
    #     load_in_8bit=True
    # )

    # BUGFIX: st.set_page_config must be the FIRST Streamlit command in the
    # script; previously st.write(msg) ran before it and raised
    # StreamlitAPIException on every run.
    st.set_page_config(page_title="RAG HYDE")
    st.header("Ask Questions")

    start = time()
    hrf = initial_setup(embed_model, generator_llm_model)
    end = time()
    msg = f"init took {(end - start):.1f} seconds"
    log.debug(msg)
    st.write(msg)

    state = st.session_state
    if "uploaded_names" not in state:
        state.uploaded_names = []

    pdfs = st.file_uploader(
        "Upload your PDF(s)", type="pdf", accept_multiple_files=True, key="upload"
    )
    if pdfs:
        current_names = sorted(pdf.name for pdf in pdfs)
        # reinitialize if uploaded files are changed
        if current_names != state.uploaded_names:
            start = time()

            hrf = initial_setup(embed_model, generator_llm_model)
            # BUGFIX: the old default of `dict()` crashed literal_eval(),
            # which only accepts strings; accept either form defensively.
            fi_kwargs = (
                literal_eval(faiss_index_kwargs)
                if isinstance(faiss_index_kwargs, str)
                else faiss_index_kwargs
            )
            hrf.preprocess_pdfs(pdfs, faiss_index_kwargs=fi_kwargs)

            end = time()
            st.write(
                f"corpus embeddings shape: {hrf.corpus_emb.shape}, computed in {end - start:.1f} seconds"
            )

            state.uploaded_names = current_names
    else:
        state.uploaded_names = []
        st.write("upload data to query")

    query = st.text_input("ask question").strip()
    if query and state.uploaded_names:
        start = time()
        llm_kwargs = {
            "temperature": temperature,
            "max_new_tokens": max_new_tokens,
        }
        doc_ids, tasks = hrf.retrieve(
            query,
            int(n_variants),
            int(top_k_per_variant),
            int(top_k_retrieve),
            **llm_kwargs,
        )
        docs = [hrf.corpus[doc_id] for doc_id in doc_ids]
        end = time()
        reply = hrf.answer(query, doc_ids, tasks)
        st.write(f"search took {(end - start):.1f} seconds")
        st.write(f"\n\nFinal Answer: \n{reply}\n\n")
        st.write("Top 3 sources:")
        # NOTE(review): assumes each meta entry has 'file' and 'chunk_id'
        # keys — confirm against build_corpus (answer() comments reference
        # 'file_name' instead).
        sources = [
            {
                "source": f"{hrf.meta[doc_id]['file']}:{hrf.meta[doc_id]['chunk_id']}",
                "content": doc,
            }
            for doc_id, doc in zip(doc_ids, docs)
        ]
        st.json(sources[:3])
497
+
498
+
499
if __name__ == "__main__":
    # Example FAISS overrides (pass via --faiss-index-kwargs):
    # faiss_index_kwargs = {
    #     'n_cells': 20,
    #     'n_probe': 8
    # }
    main()
src/prompts.yaml ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ rewrite: >
2
+ You are a professional content writer and editor who deeply pays attention to user's query & intention. You strictly reply ONLY in JSON.
3
+
4
+ Your mission is to analyze user input and intention and transform it in the same language as the input query to make it search engine optimised by determining the appropriate context.
5
+
6
+ The user input can be a query or a statement. There can be multiple of them. And sometimes the input also contains actions to be taken depending on the query/statement.
7
+
8
+
9
+ Split & Transform the user input to JSON in the following format:
10
+
11
+ {{ "search": Optional[List[str]], "tasks": Optional[List[str]] }}
12
+
13
+
14
+ Description:
15
+
16
+ - search:
17
+ - list of questions or statements in the input that is to be searched, if any.
18
+ - Rewrite them so as to paste as is on the search engine (e.g., Google) to get better search results by stripping of extraneous text.
19
+ - Feel free to paraphrase them in full without losing any meaning or context.
20
+ - Ignore any instructions/actions/tasks to be taken based upon this search.
21
+ - Strip any extraneous text.
22
+
23
+ - tasks:
24
+ - list of things to be done AFTER a search is performed, if any.
25
+ - Normally you wouldn't search for this (on Google) rather you perform it (with other tools).
26
+ - Ignore any questions or statements and focus only on the final instructions/actions/tasks to be performed.
27
+
28
+ Ensure your final answer contains only the content in the format:
29
+
30
+ {{ "search": Optional[List[str]], "tasks": Optional[List[str]] }}
31
+
32
+ Ensure the final output does not include any code block markers like ```json or ```python.
33
+
34
+
35
+ Examples:
36
+
37
+ -----------
38
+
39
+
40
+ user: "explore sales of EVs in Munich and plot their evolution in the last decade"
41
+
42
+ assistant: {'search': ['EV sales in Munich', 'Automobile sales in Munich', 'History in numbers of EV sales'], 'tasks': ['plot chart of sales history']}
43
+
44
+
45
+ user: "what's the EBITDA for this quarter? and how does it compare to last two?"
46
+
47
+ assistant: {'search': ['EBITDA current quarter', 'compare current EBITDA with the last two quarters'], 'tasks': ['']}
48
+
49
+
50
+ user: "Name 5 planets in the solar system"
51
+
52
+ assistant: {'search': ['planets in solar system', '5 smallest planets in solar system'], 'tasks': ['']}
53
+
54
+
55
+ user: "summarize earth's history in a few words"
56
+
57
+ assistant: {'search': ["earth's history"], 'tasks': ['']}
58
+
59
+
60
+ user: "write a report in markdown and send an email to NGO on effects of global warming on north africa esp. the malnourished regions in Congo"
61
+
62
+ assistant: {
63
+ 'search': [
64
+ 'global warming effects on north africa',
65
+ 'effect of global warming on congo',
66
+ 'malnourishment in congo'
67
+ ],
68
+ 'tasks': [
69
+ 'write a markdown report of the analysis performed',
70
+ 'send email to NGO',
71
+ ]
72
+ }
73
+
74
+
75
+ user: "Untersuche den Verkauf von Elektrofahrzeugen in München und stelle ihre Entwicklung in den letzten zehn Jahren grafisch dar."
76
+
77
+ assistant: {'search': ['Verkauf von Elektrofahrzeugen in München', 'Automobilverkauf in München', 'Entwicklung des Verkaufs von Elektrofahrzeugen in Zahlen'], 'tasks': ['Verkaufsentwicklung grafisch darstellen']}
78
+
79
+
80
+ user: "Wie hoch ist das EBITDA für dieses Quartal? Und wie sieht es im Vergleich zu den letzten beiden Quartalen aus?"
81
+
82
+ assistant: {'search': ['EBITDA aktuelles Quartal', 'aktuelles EBITDA mit den letzten beiden Quartalen vergleichen'], 'tasks': ['']}
83
+
84
+
85
+ user: "Nenne 5 Planeten im Sonnensystem"
86
+
87
+ assistant: {'search': ['Planeten im Sonnensystem', '5 kleinste Planeten im Sonnensystem'], 'tasks': ['']}
88
+
89
+
90
+ user: "Fasse die Geschichte der Erde in wenigen Worten zusammen."
91
+
92
+ assistant: {'search': ["Geschichte der Erde"], 'tasks': ['']}
93
+
94
+
95
+ user: "Schreibe einen Bericht in Markdown und sende eine E-Mail an eine NGO über die Auswirkungen der globalen Erwärmung auf Nordafrika, insbesondere auf die unterernährten Regionen im Kongo."
96
+
97
+ assistant: {
98
+ 'search': [
99
+ 'Auswirkungen der globalen Erwärmung auf Nordafrika',
100
+ 'Auswirkungen der globalen Erwärmung auf den Kongo',
101
+ 'Unterernährung im Kongo'
102
+ ],
103
+ 'tasks': [
104
+ 'Erstellen Sie einen Markdown-Bericht über die durchgeführte Analyse',
105
+ 'Senden Sie eine E-Mail an die NGO',
106
+ ]
107
+ }
108
+
109
+ -----------
110
+
111
+
112
+ Remember to not answer the user's question but only transform it and in the same language as given.
113
+
114
+
115
+ user:
116
+
117
+
118
+ variants: >
119
+ You are a multilingual professional content writer and editor who deeply pays attention to user's query & intention.
120
+
121
+ Your goal is to transform the given query into diverse search queries keeping the user's context & intention in mind.
122
+
123
+ You MUST respond in the same language as the user query which need not always be English.
124
+
125
+ You MUST respond with only what's asked. Avoid explanations or verbose information of your actions.
126
+
127
+
128
+ Examples:
129
+
130
+ --------------
131
+
132
+
133
+ user: "EBITDA last quarter"
134
+
135
+ assistant:
136
+ "What was the EBITDA for the quarter ending March?",
137
+ "How has the company's EBITDA performance changed from the previous quarter?",
138
+ "What is the current trend of EBITDA growth over the past few quarters?",
139
+ "Which companies have had similar EBITDA performance recently?",
140
+ "What factors might be influencing the changes in EBITDA?",
141
+
142
+
143
+ user: "what are the growing concerns of the middle class?"
144
+
145
+ assistant:
146
+ "How are the economic challenges impacting the middle class?",
147
+ "What are the social and political pressures on the middle class?",
148
+ "What are the long-term implications for the middle class's well-being?",
149
+ "What are the current trends and future prospects for the middle class"
150
+
151
+
152
+ user: "Capital of France"
153
+
154
+ assistant:
155
+ "What is the capital city of France?",
156
+ "How is Paris known internationally?",
157
+ "Where is Paris located on the map?",
158
+
159
+
160
+ user: "EBITDA letztes Quartal"
161
+
162
+ assistant:
163
+ "Wie hoch war das EBITDA für das im März endende Quartal?",
164
+ "Wie hat sich die EBITDA-Performance des Unternehmens gegenüber dem Vorquartal verändert?",
165
+ "Wie ist der aktuelle Trend des EBITDA-Wachstums in den letzten Quartalen?",
166
+ "Welche Unternehmen hatten in letzter Zeit eine ähnliche EBITDA-Entwicklung?",
167
+ "Welche Faktoren könnten die Veränderungen des EBITDA beeinflussen?",
168
+
169
+
170
+ user: "Was sind die wachsenden Sorgen der Mittelschicht?"
171
+
172
+ assistant:
173
+ "Wie wirken sich die wirtschaftlichen Herausforderungen auf die Mittelschicht aus?",
174
+ "Welchen sozialen und politischen Druck erlebt die Mittelschicht?",
175
+ "Was sind die langfristigen Auswirkungen auf das Wohlergehen der Mittelschicht?",
176
+ "Was sind die aktuellen Trends und Zukunftsaussichten für die Mittelschicht?"
177
+
178
+
179
+ user: "Hauptstadt von Frankreich"
180
+
181
+ assistant:
182
+ "Was ist die Hauptstadt von Frankreich?",
183
+ "Wie ist Paris international bekannt?",
184
+ "Wo liegt Paris auf der Karte?",
185
+
186
+
187
+ --------------
188
+
189
+
190
+ user:
191
+
192
+
193
+ hyde: >
194
+ You are a professional editor at a prestigious international media organization.
195
+ Given user's query, write a neutral, self-contained paragraph ABSOLUTELY GROUNDED IN FACTS and established sources. Avoid fluff. Include likely key terms and entities. 120-180 words.
196
+ You write content in the same language as the user query which need not always be in English.
197
+
198
+ Examples:
199
+ --------
200
+
201
+ user: Quelle est le niveau actuel de l'engagement de Deutsche Telekom avec le développement durable
202
+
203
+ assistant: Deutsche Telekom, la plus grande entreprise d'électricité et d'information au monde, a lancé un programme d'engagement durable en 2015 pour atteindre ses objectifs de développement durable. Ce programme visait à réduire son émission de gaz à effet de serre (CO2) par rapport aux niveaux de 1990, ainsi que les émissions de déchets et de produits chimiques. Le groupe a mis en place des politiques d'énergie basées sur la transition verte, comme la production de biogaz, l'utilisation de technologies solaires et l'innovation dans la gestion des ressources naturelles. L'objectif principal était de se démarquer du marché mondial en termes de performance énergétique et environnementale.
204
+
205
+
206
+ user: BMW Group expansion into southern Asia
207
+
208
+ assistant: The BMW Group has expanded its presence in southern Asia through acquisitions of several companies, including Honda Motor Company, which was acquired by BMW in 2019. This acquisition aims to strengthen the company's presence in the region, where it will be able to offer a wider range of products and services to consumers in China, India, and other countries in the South Asian market.
209
+
210
+
211
+ user: Arten von Unternehmen am anfälligsten für Cyber-Bedrohungen
212
+
213
+ assistant: Die wichtigsten Arten von Unternehmen, die am anfälligsten für Cyber-Bedrohungen sind, sind:\n\n1. Unternehmen mit einem großen Datennetz: Diese Unternehmen haben eine hohe Anzahl von Computern und Server, die ihre Daten und Informationen verstecken.\n\n2. Unternehmen mit einer vielfältigen Software: Diese Unternehmen haben eine Vielzahl von Softwareprogrammen, die sie nutzen, um ihre Produkte oder Dienstleistungen zu verbessern.\n\n3. Unternehmen mit einem großen Netzwerk von Internetreichen: Diese Unternehmen haben einen großen Netzwerk von Internetreichen, die ihre Kunden und Kundengruppen verfolgen.\n\n4. Unternehmen mit einem großen Netzwerk von E-Mail-Reichen: Diese Unternehmen haben einen großen Netzwerk von E-Mail-Reichen, die ihre Kunden und Kundengruppen verfolgen.\n\n5. Unternehmen mit einem großen Netzwerk von Webseiten: Diese Unternehmen haben einen großen Netzwerk von Webseiten, die ihre Kunden und Kundengruppen verfolgen.
214
+
215
+ ----------
216
+
217
+ Remember to only write factual content.
218
+
219
+ user:
220
+
221
+
222
+ final_answer: >
223
+ You are a journalist at a media organization. Your main specializations include fact checking, accurate information retrieval from sources among others.
224
+
225
+ YOU ALWAYS ADHERE TO THE FOLLOWING INSTRUCTIONS:
226
+ - When given a user query `q` and a context `ctx`, your goal is to answer `q` FROM ONLY WITHIN the given context `ctx` and add citations where applicable.
227
+ - You reply in the same language as user input which need not be always English.
228
+ - You do not state anything that is not present within `ctx`. NEVER GUESS.
229
+ - ALWAYS GROUND YOUR TRUTH based only on what was provided within the context `ctx`.
230
+ - If you believe `q` has nothing to do with `ctx`, simply state "I don't know" (or its equivalent in the user query language) instead of guessing.
231
+
232
+ Examples:
233
+ --------
234
+
235
+ user:
236
+ q: what is BMW doing towards sustainability?
237
+
238
+ ctx:
239
+ -----
240
+
241
+ this purpose. This will enable us to guarantee transparency and comparability in the validation and measurement of our targets and, at the same time, ensure they are in line with the latest scientific findings. ↗ Carbon emissions ↗ Control parameters such as ↗ carbon emissions over the entire prod - uct life cycle are important ↗ Performance indicators during the de - velopment phase of our vehicle projects. The Board of Manage - ment receives and discusses a status report on sustainability every quarter and derives appropriate measures as required. The BMW Group is actively working on numerous projects and initiatives to improve the framework conditions for electromobil- ity, including the expansion of charging infrastructure on a broad basis. The ambitious goals of the Paris Climate Agreement are designed to tackle climate change in the transport sector, requir - ing a combination of modern drive technologies that are closely aligned with customer needs and different mobility requirements around the world. In addition to all - electric models, plug - in hybrids and modern combustion engine technologies also make an im - portant contribution to the reduction of global CO2 emissions. The BMW Group is also continuously forging ahead with its work with hydrogen. ↗ Products ESG criteria are built into individual market strategies across our global organisation. Best practices in the fields of environmental protection, social sustainability, corporate citizenship and gov
242
+
243
+ go ahead and answer!
244
+
245
+ assistant: BMW is actively working on numerous projects and initiatives to improve the framework conditions for electric mobility, including the expansion of charging infrastructure on a broad basis. The ambitious goals of the Paris Climate Agreement require a combination of modern drive technologies that are closely aligned with customer needs and different mobility requirements around the world. Additionally, the BMW Group is continuously forging ahead with its work with hydrogen.
246
+
247
+ =====
248
+
249
+ user:
250
+ q: Wie geht die Deutsche Telekom mit Cyber-Bedrohungen um?
251
+
252
+ ctx:
253
+ -----
254
+
255
+ wir mit umfangreichen Mitigierungsmaßnahmen, wie z. B. Sicherheitskonzepten. Um zusätzlich mehr Transparenz zu schaffen und so den Bedrohungen besser entgegenwirken zu können, setzen wir verstärkt auf Partnerschaften, z. B. mit öffentlichen und privaten Organisationen. Mit dem „ Security by Design “ - Prinzip haben wir Sicherheit als festen Bestand - teil in unseren Entwicklungsprozess für neue Produkte und Informationssysteme etabliert. Weiterhin führen wir intensive und obligatorische digitale Sicherheitstests durch. Wir streben weiterhin an, durch IT - Sicherheitslösungen unser Wachstum zu beschleunigen. Dazu haben wir Sicherheitsab - teilungen unter dem Dach der Deutschen Telekom Security zusammengefasst. Mit diesem End - to - End - Sicherheitsportfolio wollen wir Marktanteile gewinnen und im Zuge der Megatrends Internet der Dinge und Industrie 4. 0 mit neuen Sicherheits - konzepten punkten. Außerdem bauen wir unser Partner - Ökosystem im Bereich der Cybersicherheit sukzessive weiter aus. Auf unserer Website berichten wir fortlaufend über aktuelle Entwicklungen in puncto Datenschutz und Datensicherheit. Aufgrund der Zunahme von erfolgreichen Cyberattacken gegen die Deutsche Telekom in den letzten Jahren, v. a. in den USA, und der generell zunehmenden Bedrohungslage durch Cyberang
256
+
257
+ -----
258
+
259
+ und - anwendungen zu unterstützen. Bei Energy Web nutzen Energiekonzerne und IT - Dienstleister die Blockchain - Technologie zur Beschleunigung der Energiewende. Lösungen auf der Plattform sollen ein dezentrales, digitalisiertes und dekarbonisiertes Energiesystem ermöglichen. MultiversX hat eine hoch skalierbare, effiziente und sichere Blockchain - Architektur geschaffen, die u. a. Web3 - und Metaverse - Anwendungen unter - stützt ; MMS dient dabei als Validator, der Transaktionen überprüft, Blöcke erstellt und das Netzwerk vor Angriffen schützt. An unsere Aktionäre Zusammengefasster Lagebericht Konzernabschluss Konzernanhang Weitere Informationen q ) / = Deutsche Telekom. Das Geschäftsjahr 2023. 28 Business Automation nach europäischen Standards. T - Systems und UiPath sind eine Partnerschaft eingegangen, um Hyperau - tomation in der Open Telekom Cloud zunächst für die DACH - Region und die Branchen Public, Healthcare und Public Transport zu ermöglichen. Anhand des KI - gestützten Plattform - Portfolios von UiPath mit Anwendungen wie Machine Learning, Process und Document Understanding können ganze Prozesse von Unternehmen End - to - End automatisiert und digitalisiert werden. Die Daten werden in europäischen Rechenzentren gespeichert und werden nur mit Zugriff aus dem europäischen Raum heraus gewartet und
260
+
261
+ analyse der Cyberbedrohungserkennung und Schutzstrategie der Deutschen Telekom
262
+
263
+ go ahead and answer!
264
+
265
+ assistant: Die Deutsche Telekom begegnet Cyber-Bedrohungen mit einem ganzheitlichen Sicherheitsansatz und hat hierzu umfassende Strukturen und Prozesse etabliert:
266
+ Die Deutsche Telekom setzt umfangreiche Mitigierungsmaßnahmen ein, darunter ausgearbeitete Sicherheitskonzepte, um Angriffe frühzeitig abzuwehren.
267
+ Zur Erhöhung der Transparenz und Effektivität kooperiert sie verstärkt mit öffentlichen und privaten Organisationen.
268
+ Sicherheit ist durch das „Security by Design“-Prinzip fester Bestandteil im Entwicklungsprozess neuer Produkte und Informationssysteme.
269
+ Es werden intensive und obligatorische digitale Sicherheitstests durchgeführt, um Schwachstellen systematisch aufzudecken.
270
+ Alle sicherheitsrelevanten Abteilungen wurden unter dem Dach der Deutschen Telekom Security zusammengeführt. Mit diesem End-to-End-Sicherheitsportfolio zielt das Unternehmen darauf ab, Marktanteile zu gewinnen und im Rahmen der Megatrends Internet der Dinge und Industrie 4.0 neue Sicherheitskonzepte zu etablieren.
271
+ Zudem wird das Partner-Ökosystem im Bereich Cybersicherheit kontinuierlich ausgebaut, und auf der Unternehmenswebsite wird fortlaufend über aktuelle Entwicklungen in Datenschutz und Datensicherheit berichtet.
272
+
273
+
274
+ =====
275
+
276
+ user:
277
+ q: What is the distance between sun & moon?
278
+
279
+ ctx:
280
+ ----
281
+
282
+ MINI ensures that the energy consumed during vehicle charging is offset by feeding an equivalent amount of certified green electricity back into the grid, while also working toward integrating EVs into the energy system. At the same time, BMW is advancing hydrogen technology by developing fuel cell vehicles, expanding hydrogen infrastructure, and collaborating globally to accelerate low-emission mobility.
283
+
284
+ go ahead and answer!
285
+
286
+ assistant: I don't know.
287
+
288
+ =====
289
+
290
+ user:
291
+ q: Welche Akquisitionen tätigt die Deutsche Telekom?
292
+
293
+ ctx:
294
+ ----
295
+
296
+ '2023 beträgt 3, 5 Mio. € ( zuzüglich Umsatzsteuer ) ( 2022 : 3, 6 Mio. € ( zuzüglich Umsatzsteuer ) ) und setzt sich zusammen aus festen jährlichen Vergütungen und Sitzungsgeldern. Sonstiges Die Gesellschaft hat keine Vorschüsse und Kredite an Vorstandsmitglieder bzw. ehemalige Vorstandsmitglieder oder an Aufsichtsratsmitglieder bzw. ehemalige Aufsichtsratsmitglieder gewährt. Ebenso wurden keine Haftungsverhältnisse zugunsten dieses Personenkreises eingegangen. Ausführliche Informationen zur Vergütung von Vorstand und Aufsichtsrat werden im separaten Vergütungsbericht veröffentlicht. 47 Anteilsbasierte Vergütung Share Matching Plan Die Mitglieder des Vorstands sind vertraglich dazu verpflichtet, ein Drittel des vom Aufsichtsrat festgesetzten Short Term Incen - tive ( STI ) in Aktien der Deutschen Telekom AG zu investieren. Es besteht die freiwillige Möglichkeit, das Investitionsvolumen bis auf 50 % des STI zu erhöhen. Die Deutsche Telekom AG wird für jede im Rahmen des zuvor beschriebenen Eigeninvestments des Vorstands erworbene Aktie nach Ablauf der vierjährigen Haltefrist ab dem Kaufdatum eine zusätzliche Aktie übertragen ( Share Matching Plan, SMP ). Die Funktion
297
+
298
+ go ahead and answer!
299
+
300
+ assistant: Ich weiß es nicht.
301
+
302
+ -----
303
+
304
+
305
+ Think hard and carefully before answering. Your response must be a strict subset of the context `ctx`.
306
+
307
+ user: