Spaces:
Sleeping
Sleeping
soft.engineer committed on
Commit ·
58259d1
1
Parent(s): 6a97c2a
add setting tab
Browse files- app.py +76 -1
- core/index.py +40 -22
- core/ingest.py +4 -4
app.py
CHANGED
|
@@ -524,7 +524,17 @@ def run_evaluation(queries_json: str, output_filename: str) -> Tuple[str, pd.Dat
|
|
| 524 |
initialize_system()
|
| 525 |
|
| 526 |
# Create Gradio interface
|
| 527 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
gr.Markdown("# RAG Evaluation System: Hierarchical vs Standard RAG")
|
| 529 |
|
| 530 |
with gr.Tab("Upload Documents"):
|
|
@@ -662,6 +672,71 @@ with gr.Blocks(title="RAG Evaluation System") as demo:
|
|
| 662 |
width=600,
|
| 663 |
height=400
|
| 664 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 665 |
|
| 666 |
# Event handlers
|
| 667 |
build_btn.click(
|
|
|
|
| 524 |
initialize_system()
|
| 525 |
|
| 526 |
# Create Gradio interface
|
| 527 |
+
# Minimal CSS to keep layout stable when vertical scrollbar appears and improve mobile spacing
|
| 528 |
+
APP_CSS = """
|
| 529 |
+
html, body { scrollbar-gutter: stable both-edges; }
|
| 530 |
+
body { overflow-y: scroll; }
|
| 531 |
+
* { box-sizing: border-box; }
|
| 532 |
+
@media (max-width: 768px) {
|
| 533 |
+
.gradio-container { padding-left: 8px; padding-right: 8px; }
|
| 534 |
+
}
|
| 535 |
+
"""
|
| 536 |
+
|
| 537 |
+
with gr.Blocks(title="RAG Evaluation System", css=APP_CSS) as demo:
|
| 538 |
gr.Markdown("# RAG Evaluation System: Hierarchical vs Standard RAG")
|
| 539 |
|
| 540 |
with gr.Tab("Upload Documents"):
|
|
|
|
| 672 |
width=600,
|
| 673 |
height=400
|
| 674 |
)
|
| 675 |
+
with gr.Tab("Settings"):
|
| 676 |
+
gr.Markdown("## Settings")
|
| 677 |
+
gr.Markdown("Configure embedding models and system preferences.")
|
| 678 |
+
|
| 679 |
+
with gr.Accordion("Embedding Configuration", open=True):
|
| 680 |
+
gr.Markdown("**Select the embedding provider and model.** Switching providers requires re-indexing your documents.")
|
| 681 |
+
|
| 682 |
+
with gr.Row():
|
| 683 |
+
with gr.Column():
|
| 684 |
+
emb_provider = gr.Radio(
|
| 685 |
+
choices=["SentenceTransformers", "OpenAI"],
|
| 686 |
+
value="SentenceTransformers",
|
| 687 |
+
label="Embeddings Provider",
|
| 688 |
+
info="Choose between local SentenceTransformers models or OpenAI embeddings (requires API key)"
|
| 689 |
+
)
|
| 690 |
+
|
| 691 |
+
with gr.Row():
|
| 692 |
+
apply_embed_btn = gr.Button("Apply Embedding Settings", variant="primary")
|
| 693 |
+
|
| 694 |
+
with gr.Row():
|
| 695 |
+
with gr.Column():
|
| 696 |
+
st_model_in = gr.Textbox(
|
| 697 |
+
label="SentenceTransformers Model",
|
| 698 |
+
value=os.getenv("ST_EMBED_MODEL", "all-MiniLM-L6-v2"),
|
| 699 |
+
interactive=False,
|
| 700 |
+
info="Local embedding model (384 dimensions)"
|
| 701 |
+
)
|
| 702 |
+
with gr.Column():
|
| 703 |
+
oai_model_in = gr.Textbox(
|
| 704 |
+
label="OpenAI Embedding Model",
|
| 705 |
+
value=os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small"),
|
| 706 |
+
interactive=False,
|
| 707 |
+
info="OpenAI embedding model (1536 dimensions for small, 3072 for large)"
|
| 708 |
+
)
|
| 709 |
+
|
| 710 |
+
embed_status = gr.Textbox(
|
| 711 |
+
label="Status",
|
| 712 |
+
lines=3,
|
| 713 |
+
interactive=False,
|
| 714 |
+
placeholder="Embedding configuration status will appear here..."
|
| 715 |
+
)
|
| 716 |
+
|
| 717 |
+
# Define handler before wiring it
|
| 718 |
+
def _apply_embeddings(provider, st_model, oai_model):
|
| 719 |
+
try:
|
| 720 |
+
use_oai = (provider == "OpenAI")
|
| 721 |
+
rag_manager.vector_store.configure_embeddings(use_oai, openai_model=oai_model, st_model_name=st_model)
|
| 722 |
+
status_msg = f"✅ Embeddings successfully configured!\n\n"
|
| 723 |
+
status_msg += f"Provider: {provider}\n"
|
| 724 |
+
if use_oai:
|
| 725 |
+
status_msg += f"Model: {oai_model} (OpenAI)\n"
|
| 726 |
+
status_msg += f"Dimensions: {3072 if 'large' in oai_model.lower() else 1536}\n"
|
| 727 |
+
else:
|
| 728 |
+
status_msg += f"Model: {st_model} (SentenceTransformers)\n"
|
| 729 |
+
status_msg += f"Dimensions: ~384\n"
|
| 730 |
+
status_msg += f"\n⚠️ Note: If switching providers, reset and rebuild your index in the Upload tab."
|
| 731 |
+
return status_msg
|
| 732 |
+
except Exception as ex:
|
| 733 |
+
return f"❌ Failed to set embeddings: {ex}\n\nPlease check your configuration and try again."
|
| 734 |
+
|
| 735 |
+
apply_embed_btn.click(
|
| 736 |
+
fn=_apply_embeddings,
|
| 737 |
+
inputs=[emb_provider, st_model_in, oai_model_in],
|
| 738 |
+
outputs=embed_status
|
| 739 |
+
)
|
| 740 |
|
| 741 |
# Event handlers
|
| 742 |
build_btn.click(
|
core/index.py
CHANGED
|
@@ -26,28 +26,18 @@ class VectorStore:
|
|
| 26 |
os.makedirs(persist_directory, exist_ok=True, mode=0o755)
|
| 27 |
|
| 28 |
self.client = chromadb.PersistentClient(path=persist_directory)
|
| 29 |
-
#
|
| 30 |
-
self.use_openai =
|
| 31 |
-
if
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
if SentenceTransformer is None:
|
| 42 |
-
raise RuntimeError("SentenceTransformer not available and OpenAI embeddings not configured.")
|
| 43 |
-
self.st_model_name = os.getenv("ST_EMBED_MODEL", "all-MiniLM-L6-v2")
|
| 44 |
-
self.embedding_model = SentenceTransformer(self.st_model_name)
|
| 45 |
-
# Get model output dimension
|
| 46 |
-
try:
|
| 47 |
-
self.embed_dim = int(getattr(self.embedding_model, "get_sentence_embedding_dimension")())
|
| 48 |
-
except Exception:
|
| 49 |
-
# Fallback: compute once
|
| 50 |
-
self.embed_dim = len(self.embedding_model.encode("test"))
|
| 51 |
|
| 52 |
def _reopen_client(self, new_path: str):
|
| 53 |
os.makedirs(new_path, exist_ok=True, mode=0o755)
|
|
@@ -61,6 +51,34 @@ class VectorStore:
|
|
| 61 |
def _resolve_collection_name(self, base_name: str) -> str:
|
| 62 |
"""Ensure separate collections per embedding dimension/provider to avoid mismatch."""
|
| 63 |
return f"{base_name}__{self._collection_suffix()}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
def create_collection(self, name: str) -> chromadb.Collection:
|
| 66 |
"""Create or get collection, namespaced by embedding provider/dimension."""
|
|
|
|
| 26 |
os.makedirs(persist_directory, exist_ok=True, mode=0o755)
|
| 27 |
|
| 28 |
self.client = chromadb.PersistentClient(path=persist_directory)
|
| 29 |
+
# Default to SentenceTransformers; runtime switching handled via configure_embeddings()
|
| 30 |
+
self.use_openai = False
|
| 31 |
+
if SentenceTransformer is None:
|
| 32 |
+
raise RuntimeError("SentenceTransformers not available. Install sentence-transformers or switch to OpenAI via UI.")
|
| 33 |
+
self.st_model_name = os.getenv("ST_EMBED_MODEL", "all-MiniLM-L6-v2")
|
| 34 |
+
self.embedding_model = SentenceTransformer(self.st_model_name)
|
| 35 |
+
# Get model output dimension
|
| 36 |
+
try:
|
| 37 |
+
self.embed_dim = int(getattr(self.embedding_model, "get_sentence_embedding_dimension")())
|
| 38 |
+
except Exception:
|
| 39 |
+
# Fallback: compute once
|
| 40 |
+
self.embed_dim = len(self.embedding_model.encode("test"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
def _reopen_client(self, new_path: str):
|
| 43 |
os.makedirs(new_path, exist_ok=True, mode=0o755)
|
|
|
|
| 51 |
def _resolve_collection_name(self, base_name: str) -> str:
|
| 52 |
"""Ensure separate collections per embedding dimension/provider to avoid mismatch."""
|
| 53 |
return f"{base_name}__{self._collection_suffix()}"
|
| 54 |
def configure_embeddings(self, use_openai: bool, openai_model: Optional[str] = None, st_model_name: Optional[str] = None):
    """Reconfigure the embedding backend at runtime.

    Switching providers (or to a model of a different dimension) implies a new
    collection suffix; existing data remains under the old suffix and must be
    re-indexed to be searchable with the new embeddings.

    Args:
        use_openai: True to embed via the OpenAI API, False for local
            SentenceTransformers.
        openai_model: OpenAI embedding model name; defaults to the
            OPENAI_EMBED_MODEL env var or "text-embedding-3-small".
        st_model_name: SentenceTransformers model name; defaults to the
            ST_EMBED_MODEL env var or "all-MiniLM-L6-v2".

    Raises:
        RuntimeError: if the requested backend is unavailable (missing
            OPENAI_API_KEY, or sentence-transformers not installed).
    """
    self.use_openai = bool(use_openai)
    if self.use_openai:
        # Check at call-time to avoid stale module-level flags
        if not os.getenv("OPENAI_API_KEY"):
            raise RuntimeError("OpenAI not available or API key missing.")
        self.openai_client = _OpenAI()
        self.openai_model = openai_model or os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
        # Consistency fix: match the UI's dimension logic in app.py
        # (3072 for any "large" model, 1536 otherwise) instead of an exact
        # name comparison that missed e.g. versioned "large" model names.
        self.embed_dim = 3072 if "large" in self.openai_model.lower() else 1536
    else:
        if SentenceTransformer is None:
            raise RuntimeError("SentenceTransformer not available.")
        name = st_model_name or os.getenv("ST_EMBED_MODEL", "all-MiniLM-L6-v2")
        # Only reload the (potentially large) model when the name changed.
        if getattr(self, "st_model_name", None) != name:
            self.st_model_name = name
            self.embedding_model = SentenceTransformer(self.st_model_name)
            try:
                self.embed_dim = int(self.embedding_model.get_sentence_embedding_dimension())
            except Exception:
                # Fallback: compute the dimension once with a probe string.
                self.embed_dim = len(self.embedding_model.encode("test"))
| 82 |
|
| 83 |
def create_collection(self, name: str) -> chromadb.Collection:
|
| 84 |
"""Create or get collection, namespaced by embedding provider/dimension."""
|
core/ingest.py
CHANGED
|
@@ -106,7 +106,7 @@ class DocumentLoader:
|
|
| 106 |
except:
|
| 107 |
# If strict=False doesn't work, try normal reader
|
| 108 |
file.seek(0)
|
| 109 |
-
|
| 110 |
|
| 111 |
text = ""
|
| 112 |
for i, page in enumerate(reader.pages):
|
|
@@ -195,7 +195,7 @@ class DocumentChunker:
|
|
| 195 |
"""
|
| 196 |
loader = DocumentLoader()
|
| 197 |
content = loader.load_document(file_path)
|
| 198 |
-
|
| 199 |
# Auto-detect language if needed
|
| 200 |
if not language or str(language).lower() == 'auto':
|
| 201 |
# Prefer OpenAI if available
|
|
@@ -407,7 +407,7 @@ class DocumentChunker:
|
|
| 407 |
content=text_block,
|
| 408 |
metadata=final_md
|
| 409 |
))
|
| 410 |
-
|
| 411 |
return chunks
|
| 412 |
|
| 413 |
def _generate_metadata(self, file_path: str, hierarchy_def: Dict[str, Any],
|
|
@@ -415,7 +415,7 @@ class DocumentChunker:
|
|
| 415 |
"""Generate hierarchical metadata for chunk"""
|
| 416 |
# Simple rule-based classification with explicit label override
|
| 417 |
content_lower = content.lower()
|
| 418 |
-
|
| 419 |
# 1) Try to honor explicit labels like "Domain:", "Section:", "Topic:"
|
| 420 |
import re
|
| 421 |
explicit_l1 = explicit_l2 = explicit_l3 = None
|
|
|
|
| 106 |
except:
|
| 107 |
# If strict=False doesn't work, try normal reader
|
| 108 |
file.seek(0)
|
| 109 |
+
reader = PyPDF2.PdfReader(file)
|
| 110 |
|
| 111 |
text = ""
|
| 112 |
for i, page in enumerate(reader.pages):
|
|
|
|
| 195 |
"""
|
| 196 |
loader = DocumentLoader()
|
| 197 |
content = loader.load_document(file_path)
|
| 198 |
+
|
| 199 |
# Auto-detect language if needed
|
| 200 |
if not language or str(language).lower() == 'auto':
|
| 201 |
# Prefer OpenAI if available
|
|
|
|
| 407 |
content=text_block,
|
| 408 |
metadata=final_md
|
| 409 |
))
|
| 410 |
+
|
| 411 |
return chunks
|
| 412 |
|
| 413 |
def _generate_metadata(self, file_path: str, hierarchy_def: Dict[str, Any],
|
|
|
|
| 415 |
"""Generate hierarchical metadata for chunk"""
|
| 416 |
# Simple rule-based classification with explicit label override
|
| 417 |
content_lower = content.lower()
|
| 418 |
+
|
| 419 |
# 1) Try to honor explicit labels like "Domain:", "Section:", "Topic:"
|
| 420 |
import re
|
| 421 |
explicit_l1 = explicit_l2 = explicit_l3 = None
|