Spaces:
Build error
Vivek Vaddina committed:
✨ Working version of RAG with HYDE
Files changed:
- app.py +459 -9
- src/__init__.py +0 -0
- src/config.py +23 -0
- src/hyde_rag.py +206 -0
- src/main.py +504 -0
- src/prompts.yaml +307 -0
app.py
CHANGED
@@ -1,18 +1,468 @@

The old app.py was an 18-line Gradio stub (the bodies of most removed lines were not captured by the diff view):

import gradio as gr
...
btn = gr.Button('Submit')
...
if __name__ == '__main__':
    demo.launch()

It is replaced wholesale by the implementation below.
import re
import math
import yaml
import json
import torch
import faiss
import string
import asyncio
import pymupdf
import gradio as gr

from time import time
from pathlib import Path
from functools import lru_cache
from ast import literal_eval
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM

from src.config import PROMPTS_FILEPATH, log

async def load_pdfs(files, max_concurrence=5):
    """
    Load multiple PDF files asynchronously.

    Args:
        files (list): PDF files to load
        max_concurrence (int): Maximum number of concurrent PDF processing tasks

    Returns:
        list: List of tuples containing (filename, extracted_text)
    """

    def _load_pdf_sync(file):
        """Synchronous PDF loading function for thread pool execution"""
        text = ""
        try:
            with pymupdf.open(file, filetype="pdf") as doc:
                text = "\n".join(page.get_text() for page in doc)
        except Exception:
            log.exception(f"Error reading {file.name}")

        return (file.name, text)

    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor(max_workers=max_concurrence) as executor:
        futures = [
            loop.run_in_executor(executor, _load_pdf_sync, file)
            for file in files
            if file is not None
        ]

        results = await asyncio.gather(*futures, return_exceptions=True)

    valid_results = [result for result in results if not isinstance(result, Exception)]

    log.info(f"successfully processed {len(valid_results)} out of {len(files)} PDFs")
    return valid_results

async def build_corpus(pdfs, text_splitter, **load_kwargs):
    texts = await load_pdfs(pdfs, **load_kwargs)
    corpus, meta = [], []
    for file_name, raw_text in texts:
        chunks = text_splitter.split_text(raw_text)
        for i, chunk in enumerate(chunks):
            corpus.append(chunk)
            meta.append({"file": file_name, "chunk_id": i})
    return corpus, meta

def generate_text(tokenizer, model, user_prompts, system_prompt=None, **llm_kwargs):
    if not system_prompt:  # treat both None and "" as "use the default"
        system_prompt = "You are a helpful assistant."

    if isinstance(user_prompts, str):
        user_prompts = [user_prompts]

    messages = [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        for user_prompt in user_prompts
    ]

    texts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    model_inputs = tokenizer(
        texts, return_tensors="pt", truncation=True, padding=True
    ).to(model.device)
    generated_ids = model.generate(
        **model_inputs,
        do_sample=True,  # sampling must be on for `temperature` to take effect
        max_new_tokens=llm_kwargs.pop("max_new_tokens", 512),
        temperature=llm_kwargs.pop("temperature", 0.4),
    )
    generated_ids = [
        output_ids[len(input_ids) :]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return response if len(user_prompts) > 1 else response[0]

def load_models(
    embed_model_name: str,
    gen_model_name: str,
    causal_lm: bool = False,
    device=None,
    bitsandbytesconfig=None,
):
    # This will take some time on the first run if the model(s) don't exist locally.
    if not device:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    embedder = SentenceTransformer(
        embed_model_name,
        device=device,
        model_kwargs={"dtype": "float16"} if device == "cuda" else {},
    )

    if not causal_lm:
        tok = AutoTokenizer.from_pretrained(gen_model_name)
        gen = AutoModelForSeq2SeqLM.from_pretrained(
            gen_model_name,  # device_map='auto',
            quantization_config=bitsandbytesconfig if bitsandbytesconfig else None,
        )
    else:
        tok = AutoTokenizer.from_pretrained(gen_model_name, padding_side="left")
        gen = AutoModelForCausalLM.from_pretrained(
            gen_model_name,
            dtype="float16",  # device_map='auto',
            quantization_config=bitsandbytesconfig if bitsandbytesconfig else None,
        )
    gen.to(device)
    return embedder, tok, gen

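(A quick usage sketch for the two helpers above, assuming the same model names the app loads further down; the prompts are placeholders and both models download on first use:)

embedder, tok, gen = load_models(
    "sentence-transformers/LaBSE", "Qwen/Qwen2.5-0.5B-Instruct", causal_lm=True
)
# batched generation: one system prompt, several user prompts at once
replies = generate_text(tok, gen, ["Define RAG.", "Define HyDE."], max_new_tokens=64)
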
def make_query_variants(
    tokenizer, model, query: str, prompt: str, n: int = 3, **llm_kwargs
):
    instructions = f" Now give me at least {n} variations."  # leading space so it doesn't glue onto the query
    resp = generate_text(tokenizer, model, query + instructions, prompt, **llm_kwargs)

    clean_resp = re.sub(r"^\d+\.\s*", "", resp, flags=re.MULTILINE).split("\n")
    return [query] + [q for q in clean_resp if q.strip()]

def clean_rewrite_resp(resp):
    try:
        resp = json.loads(resp)  # Parse JSON
    except json.JSONDecodeError:
        try:
            resp = literal_eval(resp)  # Fallback parse
        except Exception:
            pass  # Keep resp as-is if both fail

    # Ensure resp is a string before stripping and slicing
    if isinstance(resp, str):
        resp = resp.strip()
        if resp:
            start = resp.find("{")
            if start != -1:
                end = resp[::-1].find("}")
                if end != -1:
                    trimmed = resp[start : len(resp) - end]
                    if trimmed != resp:  # guard against infinite recursion when trimming changes nothing
                        return clean_rewrite_resp(trimmed)
    return resp


def transform_query(
    tokenizer, model, query: str, rewrite_prompt: str, **llm_kwargs
) -> dict:
    """Split the query into things to search and actions to take."""
    resp = generate_text(tokenizer, model, query, rewrite_prompt, **llm_kwargs)
    try:
        resp = clean_rewrite_resp(resp)
    except Exception:
        pass
    return resp

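(A worked example of what the cleanup above recovers from a typical chatty model reply; the reply text is made up:)

noisy = 'Sure! Here you go:\n{"search": ["EV sales in Munich"], "tasks": ["plot chart"]}'
clean_rewrite_resp(noisy)
# json.loads fails on the preamble, the brace-trimming pass isolates the payload,
# and the recursive call parses it: {'search': ['EV sales in Munich'], 'tasks': ['plot chart']}
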
def aggregate_queries_and_tasks(
    tokenizer,
    model,
    orig_query,
    rewrite_prompt,
    variants_prompt,
    n_variations=3,
    **llm_kwargs,
):
    # make variations for the original query as is
    queries = make_query_variants(
        tokenizer,
        model,
        orig_query.strip(),
        variants_prompt,
        n_variations,
        **llm_kwargs,
    )

    start = time()
    tr_q = transform_query(tokenizer, model, orig_query.strip(), rewrite_prompt)
    end = time()
    log.debug(f"\t\t transforming query task took {(end - start):.1f} seconds...")

    # The transformed query may contain multiple things to search and tasks to perform, depending on the user query.
    # Recursively get variations for each of the search queries but keep the tasks as is.
    tasks = []
    if isinstance(tr_q, dict):
        search_results, tasks = tr_q.get("search", []), tr_q.get("tasks", [])
        for search_result in search_results:
            queries.extend(
                make_query_variants(
                    tokenizer,
                    model,
                    search_result,
                    variants_prompt,
                    n_variations,
                    **llm_kwargs,
                )
            )

    queries = [q.strip(string.punctuation) for q in queries]
    tasks = [t.strip(string.punctuation) for t in tasks]

    # Keep the original user query as is (in case the LLM mangles it) and pick some of the rest after shuffling.
    # This is disabled: instead of looping we take advantage of batching, which is efficient enough
    # that we can run many query variations at once without worrying about performance.
    # q, queries = queries[:1], queries[1:]
    # shuffle(queries)
    # q += queries[:n_variations-1]

    return queries, tasks

def build_index(corpus_emb, n_cells=5, n_probe=2):
    log.debug(f"building index with {n_cells=}, {n_probe=}")
    d = corpus_emb.shape[1]
    quantizer = faiss.IndexFlatIP(d)
    index = faiss.IndexIVFFlat(quantizer, d, n_cells)
    index.nprobe = n_probe  # faiss spells this attribute `nprobe`
    index.train(corpus_emb)
    index.add(corpus_emb)
    # index.make_direct_map()
    return index

def reciprocal_rank_fusion(indices, top_k=3, denom=50):
    ii = indices.tolist()
    scores = defaultdict(int)
    for row in ii:
        for rank, chunk_id in enumerate(row):
            scores[chunk_id] += 1 / (rank + denom)
    results = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return [chunk_id for chunk_id, _ in results]

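(For intuition, a self-contained worked check of the fusion arithmetic above, using the same 1/(rank + denom) scoring; the chunk ids are made up:)

import numpy as np
from collections import defaultdict

def rrf(indices, top_k=3, denom=50):  # same logic as reciprocal_rank_fusion above
    scores = defaultdict(float)
    for row in indices.tolist():
        for rank, chunk_id in enumerate(row):
            scores[chunk_id] += 1 / (rank + denom)
    return [c for c, _ in sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]]

I = np.array([[7, 2, 9],   # hits for query variant 1, best first
              [2, 7, 5]])  # hits for query variant 2
print(rrf(I, top_k=2))  # [7, 2]: both score 1/50 + 1/51; the tie keeps insertion order
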
class HyDeRAGFusion:
    def __init__(
        self,
        embed_model: str,
        generator_llm_model: str,
        causal_lm: bool = True,
        chunk_overlap: int = 50,
        tokens_per_chunk: int = 256,
        embed_batch_size: int = 64,
        bitsandbytesconfig=None,
    ):
        self.embed_batch_size = embed_batch_size
        self.text_splitter = SentenceTransformersTokenTextSplitter(
            chunk_overlap, embed_model, tokens_per_chunk
        )
        self.embedder, self.tok, self.gen = load_models(
            embed_model,
            generator_llm_model,
            causal_lm,
            bitsandbytesconfig=bitsandbytesconfig,  # keyword, so it doesn't land in `device`
        )
        with open(PROMPTS_FILEPATH) as fl:
            self.prompts = yaml.safe_load(fl)

    @lru_cache(maxsize=8)
    def preprocess_pdfs(self, pdfs, data_load_kwargs={}, faiss_index_kwargs={}):
        self.corpus, self.meta = asyncio.run(
            build_corpus(pdfs, self.text_splitter, **data_load_kwargs)
        )
        log.debug(f"{len(self.corpus)}, {len(self.meta)}")
        self.corpus_emb = self.embedder.encode(
            self.corpus,
            batch_size=self.embed_batch_size,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        log.debug(f"{self.corpus_emb.shape}")

        # https://github.com/facebookresearch/faiss/issues/112
        # n_cells = int(round(4 * (self.corpus_emb.shape[0])**.5))

        # one centroid for every 100 or so vectors, and 20% of them as n_probe
        n_cells = faiss_index_kwargs.pop("n_cells", self.corpus_emb.shape[0] // 100 + 1)
        n_probe = faiss_index_kwargs.pop("n_probe", math.ceil(0.2 * n_cells))

        self.index = build_index(self.corpus_emb, n_cells, n_probe)

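(The centroid heuristic above, worked out for a hypothetical 1,000-chunk corpus:)

import math

n_vectors = 1000                    # hypothetical corpus size
n_cells = n_vectors // 100 + 1      # one centroid per ~100 vectors -> 11
n_probe = math.ceil(0.2 * n_cells)  # scan ~20% of the cells per query -> 3
print(n_cells, n_probe)             # 11 3
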
    def retrieve(
        self, query, n_variants=3, top_k_per_variant=10, top_k_retrieve=3, **llm_kwargs
    ):
        start = time()

        queries, tasks = aggregate_queries_and_tasks(
            self.tok,
            self.gen,
            query.strip(),
            self.prompts["rewrite"],
            self.prompts["variants"],
            n_variants,
            **llm_kwargs,
        )

        end = time()
        log.debug(f"aggregate task took {(end - start):.1f} seconds...")

        start = time()
        hyde_docs = generate_text(
            self.tok, self.gen, queries, self.prompts["hyde"], **llm_kwargs
        )
        end = time()
        log.debug(f"generating hyde docs took {(end - start):.1f} seconds...")

        start = time()
        chunks = []
        for hyde_doc in hyde_docs:
            chunks.extend(self.text_splitter.split_text(hyde_doc))
        q_emb = self.embedder.encode(
            chunks, batch_size=self.embed_batch_size, normalize_embeddings=True
        )
        end = time()
        log.debug(f"embedding hyde docs took {(end - start):.1f} seconds...")

        _, I = self.index.search(q_emb, top_k_per_variant)
        chunk_ids = reciprocal_rank_fusion(I, top_k_retrieve)
        return chunk_ids, tasks

    def answer(self, query, doc_ids, tasks, max_ctx_chars=128000):
        total, text, prompt_length = 0, "", 10000
        sep = "\n\n-----\n\n"
        tasks = ", ".join(tasks)

        for doc_id in doc_ids:
            # adding tags in the context caused more hallucinations.
            # Instead, we list them as sources beneath the model response.
            # _meta = self.meta[doc_id]
            # tag = f"(source: {_meta['file_name']}:{_meta['chunk_id']})"
            chunk = self.corpus[doc_id].strip()
            tag = ""

            ctx = f"{sep}{tag}\n\n{chunk}"
            if total + len(ctx) + len(tasks) + len(sep) + prompt_length > max_ctx_chars:
                break

            text += ctx
            total = len(text)

        text += f"{sep}{tasks}"

        # instruction = "Answer concisely and also cite file names & chunk ids inline like (pdf_file_name:chunk_id)."
        instruction = "go ahead and answer!"
        user_query = f"\nq: {query}\n\nctx:{text}" + f"\n\n{instruction}\n\n"

        start = time()
        resp = generate_text(
            self.tok,
            self.gen,
            user_query,
            self.prompts["final_answer"],
            temperature=0.3,
        )
        end = time()
        log.debug(f"final resp took {(end - start):.1f} seconds...")

        return resp

def initial_setup(embed_model, generator_model, bitsandbytesconfig=None):
    return HyDeRAGFusion(
        embed_model, generator_model, bitsandbytesconfig=bitsandbytesconfig
    )


start = time()
HRF = initial_setup("sentence-transformers/LaBSE", "Qwen/Qwen2.5-0.5B-Instruct")
end = time()
msg = f"init took {(end - start):.1f} seconds"
log.debug(msg)

def main(
    pdfs,
    query,
    n_variants=3,
    top_k_per_variant=5,
    top_k_retrieve=3,
    temperature=0.4,
    max_new_tokens=512,
):
    start = time()
    if pdfs:
        HRF.preprocess_pdfs(tuple(sorted(pdfs)))

    if query:
        llm_kwargs = {
            "temperature": temperature,
            "max_new_tokens": max_new_tokens,
        }
        doc_ids, tasks = HRF.retrieve(
            query,
            int(n_variants),
            int(top_k_per_variant),
            int(top_k_retrieve),
            **llm_kwargs,
        )
        docs = [HRF.corpus[doc_id] for doc_id in doc_ids]
        reply = HRF.answer(query, doc_ids, tasks)
        sources = [
            {
                "source": f"{Path(HRF.meta[doc_id]['file']).stem}:{HRF.meta[doc_id]['chunk_id']}",
                "content": doc,
            }
            for doc_id, doc in zip(doc_ids, docs)
        ]

        resp = f"{reply}\n\n{'-'*25}\n\n"
        resp += "Top 3 sources:"
        resp += f"\n\n{'-'*25}\n\n"
        for source in sources:
            resp += f'source: {source["source"]}\n\n'
            resp += source["content"]
            resp += f"\n\n{'-'*25}\n\n"

        end = time()
        log.debug(f"final resp took {(end - start):.1f} seconds")
        return resp


def reset_text_on_file_change(pdfs):
    """Reset the text input when the uploaded docs change."""
    return ""

with gr.Blocks(title='RAG with HYDE') as demo:
    gr.Markdown('# RAG with HYDE')
    with gr.Row():
        pdf_input = gr.File(label='upload PDF(s)', file_types=['.pdf'], file_count='multiple')
        query = gr.Textbox(label='question')

    btn = gr.Button('Submit')
    answer = gr.Markdown(label='### Answer')

    btn.click(main, inputs=[pdf_input, query], outputs=answer)
    pdf_input.change(reset_text_on_file_change, inputs=pdf_input, outputs=query)


if __name__ == '__main__':
    demo.launch(debug=True)
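(A minimal headless sketch of the pipeline above, for reference: `sample.pdf` and the question are placeholders, and a Path object stands in for the UI's file handle since both expose `.name`:)

from pathlib import Path

HRF.preprocess_pdfs((Path("sample.pdf"),))  # chunk, embed, and index the PDF
doc_ids, tasks = HRF.retrieve("What is this document about?")
print(HRF.answer("What is this document about?", doc_ids, tasks))
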
src/__init__.py
ADDED
File without changes
src/config.py
ADDED
@@ -0,0 +1,23 @@

import logging
from pathlib import Path

PROMPTS_FILEPATH = Path("src/prompts.yaml")


def get_logger(LOG_LEVEL="INFO"):
    LOG_PATH = Path("logs.log")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    # getLogger registers the logger by name; instantiating logging.Logger
    # directly would bypass the logging hierarchy.
    log = logging.getLogger("hyde_rag")
    log.setLevel(LOG_LEVEL)

    file_handler = logging.FileHandler(LOG_PATH)
    file_handler.setLevel(LOG_LEVEL)
    file_handler.setFormatter(formatter)

    log.addHandler(file_handler)

    return log


log = get_logger("DEBUG")
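(Usage sketch for the shared logger configured above:)

from src.config import log

log.debug("pipeline initialised")  # appended to logs.log with timestamp and level
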
src/hyde_rag.py
ADDED
@@ -0,0 +1,206 @@

# hyde_ragfusion.py
# Minimal HyDE + RAG-Fusion over local PDFs.
# Dependencies: transformers, sentence-transformers, scikit-learn, pymupdf, numpy

import os
import re
import heapq
import fitz  # PyMuPDF
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# -----------------------------
# Ingestion & Chunking
# -----------------------------
def load_pdfs(folder):
    docs = []
    for fn in os.listdir(folder):
        if fn.lower().endswith(".pdf"):
            path = os.path.join(folder, fn)
            with fitz.open(path) as doc:
                text = "\n".join(page.get_text("text") for page in doc)
            text = re.sub(r"\s+\n", "\n", text).strip()
            docs.append((fn, text))
    return docs


def chunk_text(text, chunk_size=300, overlap=50):
    words = text.split()
    chunks, i = [], 0
    while i < len(words):
        chunk = " ".join(words[i : i + chunk_size])
        chunks.append(chunk)
        i += chunk_size - overlap
    return chunks


def build_corpus(pdf_folder):
    raw = load_pdfs(pdf_folder)
    corpus, meta = [], []
    for fn, txt in raw:
        for i, ch in enumerate(chunk_text(txt)):
            corpus.append(ch)
            meta.append({"file": fn, "chunk_id": i})
    return corpus, meta


# -----------------------------
# Models (local)
# -----------------------------
def load_models():
    # Small, fast encoder for embeddings
    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    # Lightweight local generator for HyDE + answers
    gen_name = "google/flan-t5-base"
    tok = AutoTokenizer.from_pretrained(gen_name)
    gen = AutoModelForSeq2SeqLM.from_pretrained(gen_name)
    return embedder, tok, gen

# -----------------------------
# Index (cosine)
# -----------------------------
def fit_index(embeddings, n_neighbors=12):
    # n_neighbors is unused here; the per-query k is passed to kneighbors() later
    nn = NearestNeighbors(metric="cosine", algorithm="auto")
    nn.fit(embeddings)
    return nn


# -----------------------------
# RAG-Fusion (query variants) + HyDE
# -----------------------------
Q_VARIANTS_PROMPT = """You rewrite the user query into {n} diverse, specific search queries (short).
User query: "{q}"
Return each on a new line, no numbering, no extra text."""

HYDE_PROMPT = """Write a factual, neutral, self-contained paragraph that could answer:
"{q}"
Avoid fluff. Include likely key terms and entities. 120-180 words."""

ANSWER_PROMPT = """You are a helpful assistant. Use ONLY the provided context.
Question: {q}

Context:
{ctx}

Answer concisely and cite file names & chunk ids inline like (file:chunk).
"""


def generate_text(gen, tok, prompt, max_new_tokens=160, temperature=0.3):
    inputs = tok(prompt, return_tensors="pt")
    out = gen.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # greedy decoding; `temperature` is inert unless do_sample=True
        temperature=temperature,
    )
    return tok.decode(out[0], skip_special_tokens=True).strip()


def make_query_variants(gen, tok, q, n=4):
    txt = generate_text(
        gen, tok, Q_VARIANTS_PROMPT.format(q=q, n=n), max_new_tokens=120
    )
    # Split cleanly into lines (drop empties/dups; include original)
    lines = [l.strip(" -•\t") for l in txt.split("\n") if l.strip()]
    uniq = []
    seen = set()
    for l in lines + [q]:
        if l not in seen:
            seen.add(l)
            uniq.append(l)
    return uniq[:n]


def hyde_doc(gen, tok, q):
    return generate_text(gen, tok, HYDE_PROMPT.format(q=q), max_new_tokens=220)

# -----------------------------
# Retrieval + RRF
# -----------------------------
def cosine_search(nn, corpus_embeddings, query_vec, top_k=8):
    dists, idxs = nn.kneighbors(query_vec.reshape(1, -1), n_neighbors=top_k)
    # Convert cosine distance to similarity
    sims = 1 - dists[0]
    return list(zip(idxs[0].tolist(), sims.tolist()))


def reciprocal_rank_fusion(rank_lists, k=60, top_k=8):
    # rank_lists: list of [doc_id, ...] ordered best→worst
    scores = {}
    for ranks in rank_lists:
        for rank, doc_id in enumerate(ranks, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    # top by fused score
    best = heapq.nlargest(top_k, scores.items(), key=lambda x: x[1])
    return [doc_id for doc_id, _ in best]


# -----------------------------
# Pipeline
# -----------------------------
class HyDeRAGFusion:
    def __init__(self, pdf_folder):
        self.corpus, self.meta = build_corpus(pdf_folder)
        self.embedder, self.tok, self.gen = load_models()
        self.corpus_emb = self.embedder.encode(
            self.corpus,
            batch_size=64,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        self.nn = fit_index(self.corpus_emb)

    def retrieve(self, query, n_variants=4, per_variant_k=8, final_top_k=6, rrf_k=60):
        variants = make_query_variants(self.gen, self.tok, query, n=n_variants)
        rank_lists = []
        for v in variants:
            hypo = hyde_doc(self.gen, self.tok, v)  # HyDE
            q_vec = self.embedder.encode([hypo], normalize_embeddings=True)[0]
            hits = cosine_search(self.nn, self.corpus_emb, q_vec, top_k=per_variant_k)
            rank_lists.append([doc_id for doc_id, _ in hits])
        fused = reciprocal_rank_fusion(rank_lists, k=rrf_k, top_k=final_top_k)
        return fused

    def answer(self, query, doc_ids, max_ctx_chars=4000):
        # Build compact context with inline provenance
        ctx_parts = []
        total = 0
        for i in doc_ids:
            piece = self.corpus[i]
            tag = f"(source: {self.meta[i]['file']}:{self.meta[i]['chunk_id']})"
            chunk = piece.strip()
            if total + len(chunk) + len(tag) + 5 > max_ctx_chars:
                break
            ctx_parts.append(f"{chunk}\n{tag}")
            total += len(chunk) + len(tag) + 5
        ctx = "\n\n---\n\n".join(ctx_parts)
        prompt = ANSWER_PROMPT.format(q=query, ctx=ctx)
        return generate_text(self.gen, self.tok, prompt, max_new_tokens=300)


# -----------------------------
# Example usage
# -----------------------------
if __name__ == "__main__":
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("--pdf_folder", required=True, help="Folder with PDFs to index")
    ap.add_argument("--query", required=True, help="Your user question")
    ap.add_argument("--show_sources", action="store_true")
    args = ap.parse_args()

    rag = HyDeRAGFusion(args.pdf_folder)
    doc_ids = rag.retrieve(args.query)
    answer = rag.answer(args.query, doc_ids)
    print("\n=== ANSWER ===\n")
    print(answer)
    if args.show_sources:
        print("\n=== TOP SOURCES ===")
        for i in doc_ids:
            print(f"- {rag.meta[i]['file']}:{rag.meta[i]['chunk_id']}")
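(This standalone script can be run directly, e.g. `python src/hyde_rag.py --pdf_folder ./pdfs --query "What is HyDE?" --show_sources` with a placeholder folder, or used as a library; a sketch:)

from src.hyde_rag import HyDeRAGFusion  # assumes the repo root is on PYTHONPATH

rag = HyDeRAGFusion("./pdfs")            # index every PDF in the folder
doc_ids = rag.retrieve("What is HyDE?")  # query variants -> HyDE docs -> RRF
print(rag.answer("What is HyDE?", doc_ids))
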
src/main.py
ADDED
@@ -0,0 +1,504 @@

import pymupdf
import math
import faiss
import string
import yaml
import re
import json
import asyncio
import torch
import streamlit as st
import click

from collections import defaultdict
from ast import literal_eval
from time import time
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from concurrent.futures import ThreadPoolExecutor
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from src.config import PROMPTS_FILEPATH, log

async def load_pdfs(files, max_concurrence=5):
    """
    Load multiple PDF files asynchronously.

    Args:
        files (list): Uploaded PDF files to load
        max_concurrence (int): Maximum number of concurrent PDF processing tasks

    Returns:
        list: List of tuples containing (filename, extracted_text)
    """

    def _load_pdf_sync(file):
        """Synchronous PDF loading function for thread pool execution"""
        text = ""
        try:
            with pymupdf.open(stream=file.getvalue(), filetype="pdf") as doc:
                text = "\n".join(page.get_text() for page in doc)
        except Exception:
            log.exception(f"Error reading {file.name}")

        return (file.name, text)

    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor(max_workers=max_concurrence) as executor:
        futures = [
            loop.run_in_executor(executor, _load_pdf_sync, file)
            for file in files
            if file is not None
        ]

        results = await asyncio.gather(*futures, return_exceptions=True)

    valid_results = [result for result in results if not isinstance(result, Exception)]

    log.info(f"successfully processed {len(valid_results)} out of {len(files)} PDFs")
    return valid_results

async def build_corpus(pdfs, text_splitter, **load_kwargs):
    texts = await load_pdfs(pdfs, **load_kwargs)
    corpus, meta = [], []
    for file_name, raw_text in texts:
        chunks = text_splitter.split_text(raw_text)
        for i, chunk in enumerate(chunks):
            corpus.append(chunk)
            meta.append({"file": file_name, "chunk_id": i})
    return corpus, meta

def generate_text(tokenizer, model, user_prompts, system_prompt=None, **llm_kwargs):
    if not system_prompt:  # treat both None and "" as "use the default"
        system_prompt = "You are a helpful assistant."

    if isinstance(user_prompts, str):
        user_prompts = [user_prompts]

    messages = [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        for user_prompt in user_prompts
    ]

    texts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    model_inputs = tokenizer(
        texts, return_tensors="pt", truncation=True, padding=True
    ).to(model.device)
    generated_ids = model.generate(
        **model_inputs,
        do_sample=True,  # sampling must be on for `temperature` to take effect
        max_new_tokens=llm_kwargs.pop("max_new_tokens", 512),
        temperature=llm_kwargs.pop("temperature", 0.4),
    )
    generated_ids = [
        output_ids[len(input_ids) :]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return response if len(user_prompts) > 1 else response[0]

def load_models(
    embed_model_name: str,
    gen_model_name: str,
    causal_lm: bool = False,
    device=None,
    bitsandbytesconfig=None,
):
    # This will take some time on the first run if the model(s) don't exist locally.
    if not device:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    embedder = SentenceTransformer(
        embed_model_name,
        device=device,
        model_kwargs={"dtype": "float16"} if device == "cuda" else {},
    )

    if not causal_lm:
        tok = AutoTokenizer.from_pretrained(gen_model_name)
        gen = AutoModelForSeq2SeqLM.from_pretrained(
            gen_model_name,  # device_map='auto',
            quantization_config=bitsandbytesconfig if bitsandbytesconfig else None,
        )
    else:
        tok = AutoTokenizer.from_pretrained(gen_model_name, padding_side="left")
        gen = AutoModelForCausalLM.from_pretrained(
            gen_model_name,
            dtype="float16",  # device_map='auto',
            quantization_config=bitsandbytesconfig if bitsandbytesconfig else None,
        )
    gen.to(device)
    return embedder, tok, gen


def make_query_variants(
    tokenizer, model, query: str, prompt: str, n: int = 3, **llm_kwargs
):
    instructions = f" Now give me at least {n} variations."  # leading space so it doesn't glue onto the query
    resp = generate_text(tokenizer, model, query + instructions, prompt, **llm_kwargs)

    clean_resp = re.sub(r"^\d+\.\s*", "", resp, flags=re.MULTILINE).split("\n")
    return [query] + [q for q in clean_resp if q.strip()]

def clean_rewrite_resp(resp):
    try:
        resp = json.loads(resp)  # Parse JSON
    except json.JSONDecodeError:
        try:
            resp = literal_eval(resp)  # Fallback parse
        except Exception:
            pass  # Keep resp as-is if both fail

    # Ensure resp is a string before stripping and slicing
    if isinstance(resp, str):
        resp = resp.strip()
        if resp:
            start = resp.find("{")
            if start != -1:
                end = resp[::-1].find("}")
                if end != -1:
                    trimmed = resp[start : len(resp) - end]
                    if trimmed != resp:  # guard against infinite recursion when trimming changes nothing
                        return clean_rewrite_resp(trimmed)
    return resp


def transform_query(
    tokenizer, model, query: str, rewrite_prompt: str, **llm_kwargs
) -> dict:
    """Split the query into things to search and actions to take."""
    resp = generate_text(tokenizer, model, query, rewrite_prompt, **llm_kwargs)
    try:
        resp = clean_rewrite_resp(resp)
    except Exception:
        pass
    return resp

def aggregate_queries_and_tasks(
    tokenizer,
    model,
    orig_query,
    rewrite_prompt,
    variants_prompt,
    n_variations=3,
    **llm_kwargs,
):
    # make variations for the original query as is
    queries = make_query_variants(
        tokenizer,
        model,
        orig_query.strip(),
        variants_prompt,
        n_variations,
        **llm_kwargs,
    )

    start = time()
    tr_q = transform_query(tokenizer, model, orig_query.strip(), rewrite_prompt)
    end = time()
    log.debug(f"\t\t transforming query task took {(end - start):.1f} seconds...")

    # The transformed query may contain multiple things to search and tasks to perform, depending on the user query.
    # Recursively get variations for each of the search queries but keep the tasks as is.
    tasks = []
    if isinstance(tr_q, dict):
        search_results, tasks = tr_q.get("search", []), tr_q.get("tasks", [])
        for search_result in search_results:
            queries.extend(
                make_query_variants(
                    tokenizer,
                    model,
                    search_result,
                    variants_prompt,
                    n_variations,
                    **llm_kwargs,
                )
            )

    queries = [q.strip(string.punctuation) for q in queries]
    tasks = [t.strip(string.punctuation) for t in tasks]

    # Keep the original user query as is (in case the LLM mangles it) and pick some of the rest after shuffling.
    # This is disabled: instead of looping we take advantage of batching, which is efficient enough
    # that we can run many query variations at once without worrying about performance.
    # q, queries = queries[:1], queries[1:]
    # shuffle(queries)
    # q += queries[:n_variations-1]

    return queries, tasks

def build_index(corpus_emb, n_cells=5, n_probe=2):
    log.debug(f"building index with {n_cells=}, {n_probe=}")
    d = corpus_emb.shape[1]
    quantizer = faiss.IndexFlatIP(d)
    index = faiss.IndexIVFFlat(quantizer, d, n_cells)
    index.nprobe = n_probe  # faiss spells this attribute `nprobe`
    index.train(corpus_emb)
    index.add(corpus_emb)
    # index.make_direct_map()
    return index

def reciprocal_rank_fusion(indices, top_k=3, denom=50):
    ii = indices.tolist()
    scores = defaultdict(int)
    for row in ii:
        for rank, chunk_id in enumerate(row):
            scores[chunk_id] += 1 / (rank + denom)
    results = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return [chunk_id for chunk_id, _ in results]

class HyDeRAGFusion:
    def __init__(
        self,
        embed_model: str,
        generator_llm_model: str,
        causal_lm: bool = True,
        chunk_overlap: int = 50,
        tokens_per_chunk: int = 256,
        embed_batch_size: int = 64,
        bitsandbytesconfig=None,
    ):
        self.embed_batch_size = embed_batch_size
        self.text_splitter = SentenceTransformersTokenTextSplitter(
            chunk_overlap, embed_model, tokens_per_chunk
        )
        self.embedder, self.tok, self.gen = load_models(
            embed_model,
            generator_llm_model,
            causal_lm,
            bitsandbytesconfig=bitsandbytesconfig,  # keyword, so it doesn't land in `device`
        )
        with open(PROMPTS_FILEPATH) as fl:
            self.prompts = yaml.safe_load(fl)

    def preprocess_pdfs(self, pdfs, data_load_kwargs={}, faiss_index_kwargs={}):
        self.corpus, self.meta = asyncio.run(
            build_corpus(pdfs, self.text_splitter, **data_load_kwargs)
        )
        self.corpus_emb = self.embedder.encode(
            self.corpus,
            batch_size=self.embed_batch_size,
            show_progress_bar=True,
            normalize_embeddings=True,
        )

        # https://github.com/facebookresearch/faiss/issues/112
        # n_cells = int(round(4 * (self.corpus_emb.shape[0])**.5))

        # one centroid for every 100 or so vectors, and 20% of them as n_probe
        n_cells = faiss_index_kwargs.pop("n_cells", self.corpus_emb.shape[0] // 100 + 1)
        n_probe = faiss_index_kwargs.pop("n_probe", math.ceil(0.2 * n_cells))

        self.index = build_index(self.corpus_emb, n_cells, n_probe)

    def retrieve(
        self, query, n_variants=3, top_k_per_variant=10, top_k_retrieve=3, **llm_kwargs
    ):
        start = time()

        queries, tasks = aggregate_queries_and_tasks(
            self.tok,
            self.gen,
            query.strip(),
            self.prompts["rewrite"],
            self.prompts["variants"],
            n_variants,
            **llm_kwargs,
        )

        end = time()
        log.debug(f"aggregate task took {(end - start):.1f} seconds...")

        start = time()
        hyde_docs = generate_text(
            self.tok, self.gen, queries, self.prompts["hyde"], **llm_kwargs
        )
        end = time()
        log.debug(f"generating hyde docs took {(end - start):.1f} seconds...")

        start = time()
        chunks = []
        for hyde_doc in hyde_docs:
            chunks.extend(self.text_splitter.split_text(hyde_doc))
        q_emb = self.embedder.encode(
            chunks, batch_size=self.embed_batch_size, normalize_embeddings=True
        )
        end = time()
        log.debug(f"embedding hyde docs took {(end - start):.1f} seconds...")

        _, I = self.index.search(q_emb, top_k_per_variant)
        chunk_ids = reciprocal_rank_fusion(I, top_k_retrieve)
        return chunk_ids, tasks

    def answer(self, query, doc_ids, tasks, max_ctx_chars=128000):
        total, text, prompt_length = 0, "", 10000
        sep = "\n\n-----\n\n"
        tasks = ", ".join(tasks)

        for doc_id in doc_ids:
            # adding tags in the context caused more hallucinations.
            # Instead, we list them as sources beneath the model response.
            # _meta = self.meta[doc_id]
            # tag = f"(source: {_meta['file_name']}:{_meta['chunk_id']})"
            chunk = self.corpus[doc_id].strip()
            tag = ""

            ctx = f"{sep}{tag}\n\n{chunk}"
            if total + len(ctx) + len(tasks) + len(sep) + prompt_length > max_ctx_chars:
                break

            text += ctx
            total = len(text)

        text += f"{sep}{tasks}"

        # instruction = "Answer concisely and also cite file names & chunk ids inline like (pdf_file_name:chunk_id)."
        instruction = "go ahead and answer!"
        user_query = f"\nq: {query}\n\nctx:{text}" + f"\n\n{instruction}\n\n"

        start = time()
        resp = generate_text(
            self.tok,
            self.gen,
            user_query,
            self.prompts["final_answer"],
            temperature=0.3,
        )
        end = time()
        log.debug(f"final resp took {(end - start):.1f} seconds...")

        return resp

@st.cache_resource
def initial_setup(embed_model, generator_model, bitsandbytesconfig=None):
    return HyDeRAGFusion(
        embed_model, generator_model, bitsandbytesconfig=bitsandbytesconfig
    )


@click.command(context_settings=dict(show_default=True))
@click.option(
    "--embed-model",
    default="sentence-transformers/LaBSE",
    help="sentence transformers embedding model",
)
@click.option(
    "--generator-llm-model",
    default="Qwen/Qwen2.5-0.5B-Instruct",
    help="Seq2Seq or CausalLM model (preferably multi-lingual)",
)
@click.option("--n-variants", default=3, help="no. of query variants")
@click.option(
    "--top-k-per-variant",
    default=5,
    help="top `k` hits per query variant to consider for RRF",
)
@click.option(
    "--top-k-retrieve", default=3, help="top `k` chunks to retrieve after RRF"
)
@click.option("--temperature", default=0.4, help="LLM model temperature")
@click.option("--max-new-tokens", default=512, help="LLM max tokens")
@click.option(
    "--faiss-index-kwargs",
    default=dict(),
    help="kwargs to pass to the FAISS index, such as `n_cells, n_probe`",
)
def main(
    embed_model,
    generator_llm_model,
    n_variants,
    top_k_per_variant,
    top_k_retrieve,
    temperature,
    max_new_tokens,
    faiss_index_kwargs,
):
    # bits_and_bytes_cfg = BitsAndBytesConfig(
    #     load_in_8bit=True
    # )
    st.set_page_config(page_title="RAG HYDE")  # must be the first Streamlit call on the page

    start = time()
    hrf = initial_setup(embed_model, generator_llm_model)
    end = time()
    msg = f"init took {(end - start):.1f} seconds"
    log.debug(msg)
    st.write(msg)

    st.header("Ask Questions")

    state = st.session_state
    if "uploaded_names" not in state:
        state.uploaded_names = []

    pdfs = st.file_uploader(
        "Upload your PDF(s)", type="pdf", accept_multiple_files=True, key="upload"
    )
    if pdfs:
        current_names = sorted([pdf.name for pdf in pdfs])
        # reinitialize if the uploaded files changed
        if current_names != state.uploaded_names:
            start = time()

            hrf = initial_setup(embed_model, generator_llm_model)
            if isinstance(faiss_index_kwargs, str):  # click passes CLI overrides as strings
                faiss_index_kwargs = literal_eval(faiss_index_kwargs)
            hrf.preprocess_pdfs(pdfs, faiss_index_kwargs=faiss_index_kwargs)

            end = time()
            st.write(
                f"corpus embeddings shape: {hrf.corpus_emb.shape}, computed in {end - start:.1f} seconds"
            )

            state.uploaded_names = current_names
    else:
        state.uploaded_names = []
        st.write("upload data to query")

    query = st.text_input("ask question").strip()
    if query and state.uploaded_names:
        start = time()
        llm_kwargs = {
            "temperature": temperature,
            "max_new_tokens": max_new_tokens,
        }
        doc_ids, tasks = hrf.retrieve(
            query,
            int(n_variants),
            int(top_k_per_variant),
            int(top_k_retrieve),
            **llm_kwargs,
        )
        docs = [hrf.corpus[doc_id] for doc_id in doc_ids]
        end = time()
        reply = hrf.answer(query, doc_ids, tasks)
        st.write(f"search took {(end - start):.1f} seconds")
        st.write(f"\n\nFinal Answer: \n{reply}\n\n")
        st.write("Top 3 sources:")
        sources = [
            {
                "source": f"{hrf.meta[doc_id]['file']}:{hrf.meta[doc_id]['chunk_id']}",
                "content": doc,
            }
            for doc_id, doc in zip(doc_ids, docs)
        ]
        st.json(sources[:3])


if __name__ == "__main__":
    # faiss_index_kwargs = {
    #     'n_cells': 20,
    #     'n_probe': 8
    # }
    main()
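(For the Streamlit variant, the expected local invocation is along the lines of `streamlit run src/main.py`; Streamlit forwards anything after `--` to the script, so CLI overrides such as `streamlit run src/main.py -- --n-variants 5` should reach the click command. This is a sketch of the intended usage, not something verified against the Space's build.)
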
src/prompts.yaml
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rewrite: >
  You are a professional content writer and editor who pays close attention to the user's query & intention. You reply STRICTLY in JSON only.

  Your mission is to analyze the user input and intention and transform it, in the same language as the input query, into a search-engine-optimised form by determining the appropriate context.

  The user input can be a query or a statement. There can be multiple of them, and sometimes the input also contains actions to be taken depending on the query/statement.


  Split & transform the user input to JSON in the following format:

  {{ "search": Optional[List[str]], "tasks": Optional[List[str]] }}


  Description:

  - search:
    - list of questions or statements in the input that are to be searched, if any.
    - Rewrite them so they can be pasted as-is into a search engine (e.g., Google) for better results, stripping off extraneous text.
    - Feel free to paraphrase them in full without losing any meaning or context.
    - Ignore any instructions/actions/tasks to be taken based upon this search.
    - Strip any extraneous text.

  - tasks:
    - list of things to be done AFTER a search is performed, if any.
    - Normally you wouldn't search for these (on Google); rather, you perform them (with other tools).
    - Ignore any questions or statements and focus only on the final instructions/actions/tasks to be performed.

  Ensure your final answer contains only the content in the format:

  {{ "search": Optional[List[str]], "tasks": Optional[List[str]] }}

  Ensure the final output does not include any code block markers like ```json or ```python.


  Examples:

  -----------


  user: "explore sales of EVs in Munich and plot their evolution in the last decade"

  assistant: {'search': ['EV sales in Munich', 'Automobile sales in Munich', 'History in numbers of EV sales'], 'tasks': ['plot chart of sales history']}


  user: "what's the EBITDA for this quarter? and how does it compare to last two?"

  assistant: {'search': ['EBITDA current quarter', 'compare current EBITDA with the last two quarters'], 'tasks': ['']}


  user: "Name 5 planets in the solar system"

  assistant: {'search': ['planets in solar system', '5 smallest planets in solar system'], 'tasks': ['']}


  user: "summarize earth's history in a few words"

  assistant: {'search': ["earth's history"], 'tasks': ['']}


  user: "write a report in markdown and send an email to NGO on effects of global warming on north africa esp. the malnourished regions in Congo"

  assistant: {
    'search': [
      'effects of global warming on north africa',
      'effect of global warming on congo',
      'malnourishment in congo'
    ],
    'tasks': [
      'write a markdown report of the analysis performed',
      'send email to NGO',
    ]
  }


  user: "Untersuche den Verkauf von Elektrofahrzeugen in München und stelle ihre Entwicklung in den letzten zehn Jahren grafisch dar."

  assistant: {'search': ['Verkauf von Elektrofahrzeugen in München', 'Automobilverkauf in München', 'Entwicklung des Verkaufs von Elektrofahrzeugen in Zahlen'], 'tasks': ['Verkaufsentwicklung grafisch darstellen']}


  user: "Wie hoch ist das EBITDA für dieses Quartal? Und wie sieht es im Vergleich zu den letzten beiden Quartalen aus?"

  assistant: {'search': ['EBITDA aktuelles Quartal', 'aktuelles EBITDA mit den letzten beiden Quartalen vergleichen'], 'tasks': ['']}


  user: "Nenne 5 Planeten im Sonnensystem"

  assistant: {'search': ['Planeten im Sonnensystem', '5 kleinste Planeten im Sonnensystem'], 'tasks': ['']}


  user: "Fasse die Geschichte der Erde in wenigen Worten zusammen."

  assistant: {'search': ["Geschichte der Erde"], 'tasks': ['']}


  user: "Schreibe einen Bericht in Markdown und sende eine E-Mail an eine NGO über die Auswirkungen der globalen Erwärmung auf Nordafrika, insbesondere auf die unterernährten Regionen im Kongo."

  assistant: {
    'search': [
      'Auswirkungen der globalen Erwärmung auf Nordafrika',
      'Auswirkungen der globalen Erwärmung auf den Kongo',
      'Unterernährung im Kongo'
    ],
    'tasks': [
      'Erstellen Sie einen Markdown-Bericht über die durchgeführte Analyse',
      'Senden Sie eine E-Mail an die NGO',
    ]
  }

  -----------


  Remember: do not answer the user's question; only transform it, in the same language as it was given.


  user:
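Downstream, the reply produced by this `rewrite` prompt has to be parsed back into the two lists. A minimal, defensive parse might look like the sketch below (`parse_rewrite_reply` is a hypothetical helper, not code from this commit). Note that the few-shot examples above use Python-style single quotes, which strict `json.loads` rejects, hence the `ast.literal_eval` fallback; the empty-string `tasks` entries in the examples are also filtered out:

import ast
import json

def parse_rewrite_reply(reply: str) -> tuple[list[str], list[str]]:
    """Parse the rewrite prompt's JSON reply into (search, tasks) lists."""
    try:
        payload = json.loads(reply)
    except json.JSONDecodeError:
        try:
            # the few-shot examples use single quotes, which json.loads rejects
            payload = ast.literal_eval(reply)
        except (ValueError, SyntaxError):
            return [], []  # model drifted from the JSON contract; caller may retry
    search = [q.strip() for q in payload.get("search") or [] if q and q.strip()]
    tasks = [t.strip() for t in payload.get("tasks") or [] if t and t.strip()]
    return search, tasks
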
variants: >
  You are a multilingual professional content writer and editor who pays close attention to the user's query & intention.

  Your goal is to transform the given query into diverse search queries, keeping the user's context & intention in mind.

  You MUST respond in the same language as the user query, which need not always be English.

  You MUST respond with only what's asked. Avoid explanations or verbose descriptions of your actions.


  Examples:

  --------------


  user: "EBITDA last quarter"

  assistant:
  "What was the EBITDA for the quarter ending March?",
  "How has the company's EBITDA performance changed from the previous quarter?",
  "What is the current trend of EBITDA growth over the past few quarters?",
  "Which companies have had similar EBITDA performance recently?",
  "What factors might be influencing the changes in EBITDA?",


  user: "what are the growing concerns of the middle class?"

  assistant:
  "How are the economic challenges impacting the middle class?",
  "What are the social and political pressures on the middle class?",
  "What are the long-term implications for the middle class's well-being?",
  "What are the current trends and future prospects for the middle class?"


  user: "Capital of France"

  assistant:
  "What is the capital city of France?",
  "How is Paris known internationally?",
  "Where is Paris located on the map?",


  user: "EBITDA letztes Quartal"

  assistant:
  "Wie hoch war das EBITDA für das im März endende Quartal?",
  "Wie hat sich die EBITDA-Performance des Unternehmens gegenüber dem Vorquartal verändert?",
  "Wie ist der aktuelle Trend des EBITDA-Wachstums in den letzten Quartalen?",
  "Welche Unternehmen hatten in letzter Zeit eine ähnliche EBITDA-Entwicklung?",
  "Welche Faktoren könnten die Veränderungen des EBITDA beeinflussen?",


  user: "Was sind die wachsenden Sorgen der Mittelschicht?"

  assistant:
  "Wie wirken sich die wirtschaftlichen Herausforderungen auf die Mittelschicht aus?",
  "Welchen sozialen und politischen Druck erlebt die Mittelschicht?",
  "Was sind die langfristigen Auswirkungen auf das Wohlergehen der Mittelschicht?",
  "Was sind die aktuellen Trends und Zukunftsaussichten für die Mittelschicht?"


  user: "Hauptstadt von Frankreich"

  assistant:
  "Was ist die Hauptstadt von Frankreich?",
  "Wie ist Paris international bekannt?",
  "Wo liegt Paris auf der Karte?",


  --------------


  user:
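The `variants` prompt yields several phrasings of one query, each of which can be retrieved against separately. One common way to merge the per-variant hit lists is reciprocal rank fusion; the sketch below is illustrative only — whether this Space fuses variants this way is not shown in this diff:

from collections import defaultdict

def reciprocal_rank_fusion(rankings: list[list[int]], k: int = 60) -> list[int]:
    """Merge per-variant rankings of doc ids into a single ranking (RRF)."""
    scores: dict[int, float] = defaultdict(float)
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking):
            scores[doc_id] += 1.0 / (k + rank + 1)  # standard RRF weight
    return sorted(scores, key=scores.__getitem__, reverse=True)
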
hyde: >
  You are a professional editor at a prestigious international media organization.
  Given the user's query, write a neutral, self-contained paragraph ABSOLUTELY GROUNDED IN FACTS and established sources. Avoid fluff. Include likely key terms and entities. 120-180 words.
  You write content in the same language as the user query, which need not always be English.

  Examples:
  --------

  user: Quelle est le niveau actuel de l'engagement de Deutsche Telekom avec le développement durable

  assistant: Deutsche Telekom, la plus grande entreprise d'électricité et d'information au monde, a lancé un programme d'engagement durable en 2015 pour atteindre ses objectifs de développement durable. Ce programme visait à réduire son émission de gaz à effet de serre (CO2) par rapport aux niveaux de 1990, ainsi que les émissions de déchets et de produits chimiques. Le groupe a mis en place des politiques d'énergie basées sur la transition verte, comme la production de biogaz, l'utilisation de technologies solaires et l'innovation dans la gestion des ressources naturelles. L'objectif principal était de se démarquer du marché mondial en termes de performance énergétique et environnementale.


  user: BMW Group expansion into southern Asia

  assistant: The BMW Group has expanded its presence in southern Asia through acquisitions of several companies, including Honda Motor Company, which was acquired by BMW in 2019. This acquisition aims to strengthen the company's presence in the region, where it will be able to offer a wider range of products and services to consumers in China, India, and other countries in the South Asian market.


  user: Arten von Unternehmen am anfälligsten für Cyber-Bedrohungen

  assistant: Die wichtigsten Arten von Unternehmen, die am anfälligsten für Cyber-Bedrohungen sind, sind:\n\n1. Unternehmen mit einem großen Datennetz: Diese Unternehmen haben eine hohe Anzahl von Computern und Server, die ihre Daten und Informationen verstecken.\n\n2. Unternehmen mit einer vielfältigen Software: Diese Unternehmen haben eine Vielzahl von Softwareprogrammen, die sie nutzen, um ihre Produkte oder Dienstleistungen zu verbessern.\n\n3. Unternehmen mit einem großen Netzwerk von Internetreichen: Diese Unternehmen haben einen großen Netzwerk von Internetreichen, die ihre Kunden und Kundengruppen verfolgen.\n\n4. Unternehmen mit einem großen Netzwerk von E-Mail-Reichen: Diese Unternehmen haben einen großen Netzwerk von E-Mail-Reichen, die ihre Kunden und Kundengruppen verfolgen.\n\n5. Unternehmen mit einem großen Netzwerk von Webseiten: Diese Unternehmen haben einen großen Netzwerk von Webseiten, die ihre Kunden und Kundengruppen verfolgen.

  ----------

  Remember to only write factual content.

  user:
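This `hyde` prompt implements the HyDE trick: instead of embedding the short user query, the app can embed the hypothetical passage the LLM writes, which typically lands closer to real document chunks in embedding space. A sketch of that retrieval flow, assuming `generate` is whatever callable runs the prompt and `encoder` exposes an `encode()` method (both stand-ins, as is normalizing for cosine search):

import faiss
import numpy as np

def hyde_search(query: str, generate, encoder, index, top_k: int = 5):
    """Retrieve chunks by embedding an LLM-drafted hypothetical answer."""
    hypothetical = generate(query)  # runs the `hyde` prompt above
    vec = np.asarray(encoder.encode([hypothetical]), dtype="float32")  # shape (1, dim)
    faiss.normalize_L2(vec)  # cosine similarity via inner product
    scores, doc_ids = index.search(vec, top_k)  # FAISS returns (1, top_k) arrays
    return doc_ids[0], scores[0]
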
final_answer: >
  You are a journalist at a media organization. Your main specializations include fact-checking and accurate information retrieval from sources, among others.

  YOU ALWAYS ADHERE TO THE FOLLOWING INSTRUCTIONS:
  - When given a user query `q` and a context `ctx`, your goal is to answer `q` FROM ONLY WITHIN the given context `ctx` and add citations where applicable.
  - You reply in the same language as the user input, which need not always be English.
  - You do not state anything that is not present within `ctx`. NEVER GUESS.
  - ALWAYS GROUND YOUR TRUTH based only on what was provided within the context `ctx`.
  - If you believe `q` has nothing to do with `ctx`, simply state "I don't know" (or its equivalent in the user query language) instead of guessing.

  Examples:
  --------

  user:
  q: what is BMW doing towards sustainability?

  ctx:
  -----

  this purpose. This will enable us to guarantee transparency and comparability in the validation and measurement of our targets and, at the same time, ensure they are in line with the latest scientific findings. ↗ Carbon emissions ↗ Control parameters such as ↗ carbon emissions over the entire prod - uct life cycle are important ↗ Performance indicators during the de - velopment phase of our vehicle projects. The Board of Manage - ment receives and discusses a status report on sustainability every quarter and derives appropriate measures as required. The BMW Group is actively working on numerous projects and initiatives to improve the framework conditions for electromobil- ity, including the expansion of charging infrastructure on a broad basis. The ambitious goals of the Paris Climate Agreement are designed to tackle climate change in the transport sector, requir - ing a combination of modern drive technologies that are closely aligned with customer needs and different mobility requirements around the world. In addition to all - electric models, plug - in hybrids and modern combustion engine technologies also make an im - portant contribution to the reduction of global CO2 emissions. The BMW Group is also continuously forging ahead with its work with hydrogen. ↗ Products ESG criteria are built into individual market strategies across our global organisation. Best practices in the fields of environmental protection, social sustainability, corporate citizenship and gov

  go ahead and answer!

  assistant: BMW is actively working on numerous projects and initiatives to improve the framework conditions for electric mobility, including the expansion of charging infrastructure on a broad basis. The ambitious goals of the Paris Climate Agreement require a combination of modern drive technologies that are closely aligned with customer needs and different mobility requirements around the world. Additionally, the BMW Group is continuously forging ahead with its work with hydrogen.

  =====

  user:
  q: Wie geht die Deutsche Telekom mit Cyber-Bedrohungen um?

  ctx:
  -----

  wir mit umfangreichen Mitigierungsmaßnahmen, wie z. B. Sicherheitskonzepten. Um zusätzlich mehr Transparenz zu schaffen und so den Bedrohungen besser entgegenwirken zu können, setzen wir verstärkt auf Partnerschaften, z. B. mit öffentlichen und privaten Organisationen. Mit dem „ Security by Design “ - Prinzip haben wir Sicherheit als festen Bestand - teil in unseren Entwicklungsprozess für neue Produkte und Informationssysteme etabliert. Weiterhin führen wir intensive und obligatorische digitale Sicherheitstests durch. Wir streben weiterhin an, durch IT - Sicherheitslösungen unser Wachstum zu beschleunigen. Dazu haben wir Sicherheitsab - teilungen unter dem Dach der Deutschen Telekom Security zusammengefasst. Mit diesem End - to - End - Sicherheitsportfolio wollen wir Marktanteile gewinnen und im Zuge der Megatrends Internet der Dinge und Industrie 4. 0 mit neuen Sicherheits - konzepten punkten. Außerdem bauen wir unser Partner - Ökosystem im Bereich der Cybersicherheit sukzessive weiter aus. Auf unserer Website berichten wir fortlaufend über aktuelle Entwicklungen in puncto Datenschutz und Datensicherheit. Aufgrund der Zunahme von erfolgreichen Cyberattacken gegen die Deutsche Telekom in den letzten Jahren, v. a. in den USA, und der generell zunehmenden Bedrohungslage durch Cyberang

  -----

  und - anwendungen zu unterstützen. Bei Energy Web nutzen Energiekonzerne und IT - Dienstleister die Blockchain - Technologie zur Beschleunigung der Energiewende. Lösungen auf der Plattform sollen ein dezentrales, digitalisiertes und dekarbonisiertes Energiesystem ermöglichen. MultiversX hat eine hoch skalierbare, effiziente und sichere Blockchain - Architektur geschaffen, die u. a. Web3 - und Metaverse - Anwendungen unter - stützt ; MMS dient dabei als Validator, der Transaktionen überprüft, Blöcke erstellt und das Netzwerk vor Angriffen schützt. An unsere Aktionäre Zusammengefasster Lagebericht Konzernabschluss Konzernanhang Weitere Informationen q ) / = Deutsche Telekom. Das Geschäftsjahr 2023. 28 Business Automation nach europäischen Standards. T - Systems und UiPath sind eine Partnerschaft eingegangen, um Hyperau - tomation in der Open Telekom Cloud zunächst für die DACH - Region und die Branchen Public, Healthcare und Public Transport zu ermöglichen. Anhand des KI - gestützten Plattform - Portfolios von UiPath mit Anwendungen wie Machine Learning, Process und Document Understanding können ganze Prozesse von Unternehmen End - to - End automatisiert und digitalisiert werden. Die Daten werden in europäischen Rechenzentren gespeichert und werden nur mit Zugriff aus dem europäischen Raum heraus gewartet und

  analyse der Cyberbedrohungserkennung und Schutzstrategie der Deutschen Telekom

  go ahead and answer!

  assistant: Die Deutsche Telekom begegnet Cyber-Bedrohungen mit einem ganzheitlichen Sicherheitsansatz und hat hierzu umfassende Strukturen und Prozesse etabliert:
  Die Deutsche Telekom setzt umfangreiche Mitigierungsmaßnahmen ein, darunter ausgearbeitete Sicherheitskonzepte, um Angriffe frühzeitig abzuwehren.
  Zur Erhöhung der Transparenz und Effektivität kooperiert sie verstärkt mit öffentlichen und privaten Organisationen.
  Sicherheit ist durch das „Security by Design“-Prinzip fester Bestandteil im Entwicklungsprozess neuer Produkte und Informationssysteme.
  Es werden intensive und obligatorische digitale Sicherheitstests durchgeführt, um Schwachstellen systematisch aufzudecken.
  Alle sicherheitsrelevanten Abteilungen wurden unter dem Dach der Deutschen Telekom Security zusammengeführt. Mit diesem End-to-End-Sicherheitsportfolio zielt das Unternehmen darauf ab, Marktanteile zu gewinnen und im Rahmen der Megatrends Internet der Dinge und Industrie 4.0 neue Sicherheitskonzepte zu etablieren.
  Zudem wird das Partner-Ökosystem im Bereich Cybersicherheit kontinuierlich ausgebaut, und auf der Unternehmenswebsite wird fortlaufend über aktuelle Entwicklungen in Datenschutz und Datensicherheit berichtet.

  =====

  user:
  q: What is the distance between sun & moon?

  ctx:
  ----

  MINI ensures that the energy consumed during vehicle charging is offset by feeding an equivalent amount of certified green electricity back into the grid, while also working toward integrating EVs into the energy system. At the same time, BMW is advancing hydrogen technology by developing fuel cell vehicles, expanding hydrogen infrastructure, and collaborating globally to accelerate low-emission mobility.

  go ahead and answer!

  assistant: I don't know.

  =====

  user:
  q: Welche Akquisitionen tätigt die Deutsche Telekom?

  ctx:
  ----

  '2023 beträgt 3, 5 Mio. € ( zuzüglich Umsatzsteuer ) ( 2022 : 3, 6 Mio. € ( zuzüglich Umsatzsteuer ) ) und setzt sich zusammen aus festen jährlichen Vergütungen und Sitzungsgeldern. Sonstiges Die Gesellschaft hat keine Vorschüsse und Kredite an Vorstandsmitglieder bzw. ehemalige Vorstandsmitglieder oder an Aufsichtsratsmitglieder bzw. ehemalige Aufsichtsratsmitglieder gewährt. Ebenso wurden keine Haftungsverhältnisse zugunsten dieses Personenkreises eingegangen. Ausführliche Informationen zur Vergütung von Vorstand und Aufsichtsrat werden im separaten Vergütungsbericht veröffentlicht. 47 Anteilsbasierte Vergütung Share Matching Plan Die Mitglieder des Vorstands sind vertraglich dazu verpflichtet, ein Drittel des vom Aufsichtsrat festgesetzten Short Term Incen - tive ( STI ) in Aktien der Deutschen Telekom AG zu investieren. Es besteht die freiwillige Möglichkeit, das Investitionsvolumen bis auf 50 % des STI zu erhöhen. Die Deutsche Telekom AG wird für jede im Rahmen des zuvor beschriebenen Eigeninvestments des Vorstands erworbene Aktie nach Ablauf der vierjährigen Haltefrist ab dem Kaufdatum eine zusätzliche Aktie übertragen ( Share Matching Plan, SMP ). Die Funktion

  go ahead and answer!

  assistant: Ich weiß es nicht.

  -----


  Think hard and carefully before answering. Your response must be a strict subset of the context `ctx`.

  user:
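The `final_answer` prompt expects the `q:` / `ctx:` / "go ahead and answer!" layout used in its own few-shot examples. Assembling that request might look like the sketch below; `build_final_answer_messages` is a hypothetical helper that mirrors the example formatting, and the real call site lives in src/main.py, which is not fully shown here:

def build_final_answer_messages(q: str, chunks: list[str], system_prompt: str) -> list[dict]:
    """Lay out the question and retrieved chunks exactly as the few-shot examples do."""
    ctx = "\n\n-----\n\n".join(chunks)  # chunk separator mirrors the examples
    user_msg = f"q: {q}\n\nctx:\n-----\n\n{ctx}\n\ngo ahead and answer!"
    return [
        {"role": "system", "content": system_prompt},  # the final_answer prompt text
        {"role": "user", "content": user_msg},
    ]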