|
|
import os |
|
|
import re |
|
|
import ast |
|
|
import threading |
|
|
from dataclasses import dataclass |
|
|
from typing import List, Tuple, Optional, Dict, Any |
|
|
from itertools import islice |
|
|
|
|
|
import numpy as np |
|
|
import gradio as gr |
|
|
from rank_bm25 import BM25Okapi |
|
|
from sentence_transformers import SentenceTransformer, CrossEncoder |
|
|
from litellm import completion |
|
|
from datasets import load_dataset |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Configuration ----------------------------------------------------------

# Hugging Face dataset to index.
HF_DATASET_NAME = "CodeKapital/CookingRecipes"

# Bi-encoder for dense retrieval and cross-encoder for reranking.
DENSE_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
RERANK_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Word-based chunking parameters.
CHUNK_SIZE_WORDS = 350
CHUNK_OVERLAP_WORDS = 60

# Retrieval fan-out per retriever and final context size fed to the LLM.
TOPK_BM25 = 25
TOPK_DENSE = 25
TOPK_AFTER_RERANK = 6

# Base URL for `ollama/...` models routed through LiteLLM.
OLLAMA_BASE_URL = "http://localhost:11434"

# Number of recipes indexed at startup.
DEFAULT_N_RECORDS = 500


@dataclass
class Chunk:
    """One retrievable unit of text produced by chunking a recipe document."""

    # Unique id of the form "<source>::chunk<N>".
    chunk_id: str
    # Human-readable provenance label for citations.
    source: str
    # The chunk's text content.
    text: str


# Collapses any run of whitespace (including newlines) into one space.
_whitespace_re = re.compile(r"\s+")

# Alphanumeric tokens: Latin, Cyrillic (incl. Ukrainian letters), digits.
_token_re = re.compile(r"[A-Za-zА-Яа-яІіЇїЄє0-9]+")


def normalize_text(text: str) -> str:
    """Replace non-breaking spaces and collapse whitespace runs to single spaces."""
    text = (text or "").replace("\u00a0", " ")
    text = _whitespace_re.sub(" ", text).strip()
    return text


def tokenize_for_bm25(text: str) -> List[str]:
    """Return lowercased alphanumeric tokens for BM25 keyword matching."""
    return [t.lower() for t in _token_re.findall(text or "")]


def chunk_text(
    source: str,
    text: str,
    chunk_size_words: int = CHUNK_SIZE_WORDS,
    overlap_words: int = CHUNK_OVERLAP_WORDS
) -> List[Chunk]:
    """Split *text* into word-based chunks with overlap.

    Args:
        source: Provenance label copied into every chunk.
        text: Raw document text; split on whitespace.
        chunk_size_words: Maximum number of words per chunk.
        overlap_words: Words shared between consecutive chunks.

    Returns:
        List of Chunk objects; empty if *text* contains no words.
    """
    words = (text or "").split()
    if not words:
        return []

    chunks: List[Chunk] = []
    start = 0
    idx = 0

    while start < len(words):
        end = min(start + chunk_size_words, len(words))
        chunk_str = " ".join(words[start:end]).strip()

        if chunk_str:
            chunks.append(Chunk(
                chunk_id=f"{source}::chunk{idx}",
                source=source,
                text=chunk_str
            ))
            idx += 1

        if end == len(words):
            break
        # BUGFIX: the original `start = max(0, end - overlap_words)` loops
        # forever when overlap_words >= chunk_size_words, because `start`
        # never advances. Guarantee forward progress; with the default
        # overlap < chunk size the behavior is unchanged.
        next_start = end - overlap_words
        start = next_start if next_start > start else end

    return chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _to_list(x: Any) -> List[str]: |
|
|
"""ingredients/directions можуть бути list або строкою зі списком.""" |
|
|
if x is None: |
|
|
return [] |
|
|
if isinstance(x, list): |
|
|
return [str(i).strip() for i in x if str(i).strip()] |
|
|
if isinstance(x, str): |
|
|
s = x.strip() |
|
|
if not s: |
|
|
return [] |
|
|
try: |
|
|
v = ast.literal_eval(s) |
|
|
if isinstance(v, list): |
|
|
return [str(i).strip() for i in v if str(i).strip()] |
|
|
except Exception: |
|
|
pass |
|
|
if "\n" in s: |
|
|
parts = [p.strip(" -•\t") for p in s.splitlines()] |
|
|
else: |
|
|
parts = [p.strip() for p in s.split(",")] |
|
|
return [p for p in parts if p] |
|
|
return [str(x).strip()] if str(x).strip() else [] |
|
|
|
|
|
|
|
|
def recipe_row_to_doc(row: Dict[str, Any], idx: int) -> Tuple[str, str]:
    """Convert one dataset row into a (source_name, full_text) pair."""
    title = (row.get("title") or "").strip()
    link = (row.get("link") or "").strip()
    src = (row.get("source") or "").strip()

    ingredients = _to_list(row.get("ingredients"))
    directions = _to_list(row.get("directions"))

    # Source label: stable index, then optional truncated title and link.
    safe_title = title[:80].replace("\n", " ").strip()
    label_parts = [f"CookingRecipes#{idx}"]
    if safe_title:
        label_parts.append(safe_title)
    if link:
        label_parts.append(link)
    source_name = " | ".join(label_parts)

    # Document body: metadata header plus bulleted/numbered sections.
    sections = [f"Title: {title or '(unknown)'}"]
    if src:
        sections.append(f"Source: {src}")
    if link:
        sections.append(f"Link: {link}")

    if ingredients:
        bullets = "\n".join(f"- {i}" for i in ingredients)
        sections.append("Ingredients:\n" + bullets)
    if directions:
        numbered = "\n".join(f"{i+1}. {d}" for i, d in enumerate(directions))
        sections.append("Directions:\n" + numbered)

    full_text = normalize_text("\n\n".join(sections))
    return source_name, full_text
|
|
|
|
|
|
|
|
def load_first_n_recipes(n: int, streaming: bool = True) -> List[Tuple[str, str]]:
    """Fetch the first *n* dataset rows and convert each to (source, text)."""
    limit = int(max(0, n))
    if limit == 0:
        return []

    if streaming:
        # Streaming avoids downloading the whole dataset for a small prefix.
        stream = load_dataset(HF_DATASET_NAME, split="train", streaming=True)
        rows = islice(stream, limit)
    else:
        rows = load_dataset(HF_DATASET_NAME, split=f"train[:{limit}]")

    result: List[Tuple[str, str]] = []
    for idx, row in enumerate(rows):
        name, body = recipe_row_to_doc(row, idx)
        # Skip rows that produced no usable text.
        if body.strip():
            result.append((name, body))
    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RAGEngine:
    """Chunk index (BM25 + dense embeddings) plus retrieval and LLM answering.

    Not thread-safe by itself; callers serialize access with an external lock.
    """

    def __init__(self):
        self.chunks: List[Chunk] = []
        self.bm25: Optional[BM25Okapi] = None
        self.bm25_corpus_tokens: List[List[str]] = []

        # Models are loaded lazily on first index build (see ensure_models).
        self.dense_model: Optional[SentenceTransformer] = None
        self.rerank_model: Optional[CrossEncoder] = None
        self.chunk_embeddings: Optional[np.ndarray] = None

        self.last_build_info: str = "Index not built yet."

    def ensure_models(self) -> None:
        """Load the bi-encoder and cross-encoder once, on first use."""
        if self.dense_model is None:
            self.dense_model = SentenceTransformer(DENSE_MODEL_NAME)
        if self.rerank_model is None:
            self.rerank_model = CrossEncoder(RERANK_MODEL_NAME)

    def build_from_dataset(self, n_records: int, streaming: bool) -> None:
        """(Re)build the BM25 and dense indexes from the first *n_records* recipes."""
        docs = load_first_n_recipes(n_records, streaming=streaming)

        all_chunks: List[Chunk] = []
        for source, text in docs:
            all_chunks.extend(chunk_text(source, text))

        self.chunks = all_chunks

        if not self.chunks:
            # Leave the engine in a consistent "empty" state.
            self.bm25 = None
            self.chunk_embeddings = None
            self.last_build_info = "No chunks built (N too small or empty rows)."
            return

        self.ensure_models()

        # Keyword index.
        self.bm25_corpus_tokens = [tokenize_for_bm25(c.text) for c in self.chunks]
        self.bm25 = BM25Okapi(self.bm25_corpus_tokens)

        # Dense index: normalized embeddings so dot product == cosine similarity.
        embs = self.dense_model.encode(
            [c.text for c in self.chunks],
            batch_size=64,
            show_progress_bar=True,
            normalize_embeddings=True
        )
        self.chunk_embeddings = np.asarray(embs, dtype=np.float32)

        self.last_build_info = (
            f"Built index from {len(docs)} recipes → {len(self.chunks)} chunks. "
            f"Streaming={streaming}."
        )

    def retrieve_candidates(
        self,
        query: str,
        use_bm25: bool,
        use_dense: bool,
        topk_bm25: int = TOPK_BM25,
        topk_dense: int = TOPK_DENSE
    ) -> List[int]:
        """Return candidate chunk indices, best-scored first per retriever.

        BUGFIX: the original collected candidates in a set and returned
        list(set), which destroys ranking order — the no-rerank caller then
        truncated arbitrary candidates with `cands[:k]`. An insertion-ordered
        dict keeps BM25 hits first (in score order), then dense hits (in
        score order), deduplicated.
        """
        if not self.chunks:
            return []

        ordered: Dict[int, None] = {}

        if use_bm25 and self.bm25 is not None:
            q_tokens = tokenize_for_bm25(query)
            scores = self.bm25.get_scores(q_tokens)
            for i in np.argsort(scores)[::-1][:int(topk_bm25)]:
                ordered.setdefault(int(i))

        if use_dense and self.dense_model is not None and self.chunk_embeddings is not None:
            q_emb = self.dense_model.encode([query], normalize_embeddings=True)
            q_emb = np.asarray(q_emb, dtype=np.float32)[0]
            # Embeddings are normalized, so the dot product is cosine similarity.
            sims = self.chunk_embeddings @ q_emb
            for i in np.argsort(sims)[::-1][:int(topk_dense)]:
                ordered.setdefault(int(i))

        return list(ordered)

    def rerank(self, query: str, candidate_idx: List[int], top_n: int = TOPK_AFTER_RERANK) -> List[int]:
        """Re-score candidates with the cross-encoder and return the top_n indices."""
        if not candidate_idx:
            return []
        if self.rerank_model is None:
            # Models not loaded yet: fall back to the incoming order.
            return candidate_idx[:int(top_n)]

        pairs = [(query, self.chunks[i].text) for i in candidate_idx]
        scores = self.rerank_model.predict(pairs)
        order = np.argsort(scores)[::-1]
        return [candidate_idx[i] for i in order[:int(top_n)]]

    def build_context(self, selected_idx: List[int]) -> str:
        """Format the selected chunks as numbered, citable context blocks."""
        blocks = []
        for j, i in enumerate(selected_idx, start=1):
            c = self.chunks[i]
            blocks.append(
                f"[{j}] Source: {c.source} | {c.chunk_id}\n{c.text}"
            )
        return "\n\n---\n\n".join(blocks)

    def answer_with_llm(self, query: str, context: str, model: str, api_key: str, temperature: float = 0.2) -> str:
        """Ask the LLM (via LiteLLM) to answer *query* using only *context*.

        Side effect: exports the provider API key into os.environ for
        providers that LiteLLM resolves from the environment.
        """
        model = (model or "").strip()
        api_key = (api_key or "").strip()
        if not model:
            return "Model is empty."

        if model.startswith("openai/") or model.startswith("gpt-"):
            if api_key:
                os.environ["OPENAI_API_KEY"] = api_key
        elif model.startswith("openrouter/"):
            if api_key:
                os.environ["OPENROUTER_API_KEY"] = api_key
        elif model.startswith("groq/"):
            if api_key:
                os.environ["GROQ_API_KEY"] = api_key

        system = (
            "You are a helpful QA assistant.\n"
            "Answer the user's question using ONLY the provided context.\n"
            "If the answer is not in the context, say you don't know.\n"
            "When you use facts from the context, add citations like [1] referring to the chunk numbers."
        )
        user = f"Question: {query}\n\nContext:\n{context}"

        extra = {}
        if model.startswith("ollama/"):
            # Local Ollama needs an explicit base URL instead of an API key.
            extra["api_base"] = OLLAMA_BASE_URL

        resp = completion(
            model=model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            temperature=temperature,
            api_key=api_key if api_key else None,
            **extra
        )
        return resp["choices"][0]["message"]["content"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Global engine shared by all Gradio callbacks; ENGINE_LOCK serializes access
# because RAGEngine itself is not thread-safe.
ENGINE = RAGEngine()

ENGINE_LOCK = threading.Lock()


# Build the initial index at import time so the UI starts ready to answer.
# NOTE(review): this downloads data and loads models on import — intentional
# for this demo script, but slows startup.
with ENGINE_LOCK:

    ENGINE.build_from_dataset(DEFAULT_N_RECORDS, streaming=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def rebuild_index(n_records: int, streaming: bool) -> str:
    """Gradio callback: rebuild the global index and report its status line."""
    with ENGINE_LOCK:
        ENGINE.build_from_dataset(int(n_records), bool(streaming))
        status = ENGINE.last_build_info
    return status
|
|
|
|
|
|
|
|
def qa(
    question: str,
    use_bm25: bool,
    use_dense: bool,
    use_rerank: bool,
    model: str,
    api_key: str,
    topk_bm25: int,
    topk_dense: int,
    topk_final: int
):
    """Gradio callback: retrieve context for *question* and answer with the LLM.

    Returns a (answer_markdown, context_text) pair for the two output widgets.
    """
    question = (question or "").strip()
    if not question:
        return "Type a question.", ""

    if not use_bm25 and not use_dense:
        return "Enable BM25 and/or Dense retrieval (otherwise there is no context).", ""

    # BUGFIX: hold the lock only while touching shared index state. The
    # original kept ENGINE_LOCK held across the LLM network call, blocking
    # every other query and any index rebuild for the full completion time.
    with ENGINE_LOCK:
        if not ENGINE.chunks:
            return "Index is empty. Click 'Rebuild index' with N>0.", ""

        cands = ENGINE.retrieve_candidates(
            question,
            use_bm25=use_bm25,
            use_dense=use_dense,
            topk_bm25=int(topk_bm25),
            topk_dense=int(topk_dense)
        )
        if not cands:
            return "No candidates retrieved.", ""

        if use_rerank:
            selected = ENGINE.rerank(question, cands, top_n=int(topk_final))
        else:
            selected = cands[:int(topk_final)]

        context = ENGINE.build_context(selected)

    # LLM call happens outside the lock; it reads no shared engine state
    # beyond the already-built context string.
    try:
        answer = ENGINE.answer_with_llm(question, context, model=model, api_key=api_key)
    except Exception as e:
        # Surface provider/network failures in the UI instead of crashing.
        answer = f"LLM call failed: {type(e).__name__}: {e}"

    return answer, context
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_demo() -> gr.Blocks:
    """Assemble the Gradio UI: index controls, retrieval toggles, and QA I/O."""
    with gr.Blocks(title="RAG QA on CookingRecipes (BM25 + Dense + Rerank)") as demo:
        gr.Markdown(
            "# RAG QA (CookingRecipes)\n"
            f"Dataset: `{HF_DATASET_NAME}`. Індексуємо **перші N рецептів**.\n\n"
        )

        # Index controls: how many recipes to (re)index and how to load them.
        with gr.Row():
            n_records = gr.Slider(50, 5000, value=DEFAULT_N_RECORDS, step=50, label="N recipes to index (first N)")
            streaming = gr.Checkbox(value=True, label="Use streaming (recommended)")

        build_btn = gr.Button("Rebuild index")
        build_status = gr.Markdown(value=f"**Status:** {ENGINE.last_build_info}")

        build_btn.click(fn=rebuild_index, inputs=[n_records, streaming], outputs=[build_status])

        gr.Markdown("---")

        # Question input.
        with gr.Row():
            question = gr.Textbox(label="Question", placeholder="Ask about recipes...", lines=2)

        # Retrieval-pipeline toggles.
        with gr.Row():
            use_bm25 = gr.Checkbox(value=True, label="Use BM25 (keyword)")
            use_dense = gr.Checkbox(value=True, label="Use Dense (embeddings)")
            use_rerank = gr.Checkbox(value=True, label="Use Cross-Encoder Reranker")

        # LLM backend selection (any LiteLLM provider string).
        with gr.Row():
            model = gr.Textbox(
                label="LLM model (LiteLLM)",
                value="openai/gpt-4o-mini",
                placeholder="e.g. openai/gpt-4o-mini OR groq/... OR openrouter/..."
            )
            api_key = gr.Textbox(
                label="API key (leave empty for Ollama)",
                placeholder="Empty for local ollama",
                type="password"
            )

        # Retrieval depth knobs mirroring the module-level defaults.
        with gr.Row():
            topk_bm25 = gr.Slider(5, 80, value=TOPK_BM25, step=1, label="Top-K BM25 candidates")
            topk_dense = gr.Slider(5, 80, value=TOPK_DENSE, step=1, label="Top-K Dense candidates")
            topk_final = gr.Slider(1, 12, value=TOPK_AFTER_RERANK, step=1, label="Chunks to LLM (final)")

        run_btn = gr.Button("Answer")

        # Outputs: the LLM answer plus the raw retrieved context for debugging.
        answer = gr.Markdown(label="Answer")
        context = gr.Textbox(label="Retrieved context (debug)", lines=16)

        run_btn.click(
            fn=qa,
            inputs=[question, use_bm25, use_dense, use_rerank, model, api_key, topk_bm25, topk_dense, topk_final],
            outputs=[answer, context]
        )

    return demo
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Build the UI and start the local Gradio server.
    app = build_demo()
    app.launch()
|
|
|
|
|
|
|
|
|