Spaces:

sivan22
/

Ituria

Running

App Files Files Community

sivan22 commited on Dec 13, 2024

Commit

f6c0c6c

verified ·

1 Parent(s): 68b3376

Upload tantivy_search_agent.py

Browse files

Files changed (1) hide show

tantivy_search_agent.py +165 -0

tantivy_search_agent.py ADDED Viewed

	@@ -0,0 +1,165 @@

+from typing import List, Dict, Any, Optional
+from tantivy import Index
+import logging
+import os
+import re
+class TantivySearchAgent:
+    def __init__(self, index_path: str):
+        """Initialize the Tantivy search agent with the index path"""
+        self.index_path = index_path
+        self.logger = logging.getLogger(__name__)
+        try:
+            self.index = Index.open(index_path)
+            self.logger.info(f"Successfully opened Tantivy index at {index_path}")
+        except Exception as e:
+            self.logger.error(f"Failed to open Tantivy index: {e}")
+            raise
+    def get_query_instructions(self) -> str:
+        """Return instructions for the LLM on how to parse and construct Tantivy queries"""
+        return """
+Instructions for generating a query:
+1. Boolean Operators:
+   - AND: term1 AND term2 (both required)
+   - OR: term1 OR term2 (either term)
+   - Multiple words default to OR operation (cloud network = cloud OR network)
+   - AND takes precedence over OR
+   - Example: Shabath AND (walk OR go)
+2. Field-specific Terms:
+   - Field-specific terms: field:term
+   - Example: text:אדם AND reference:בראשית
+   - available fields: text, reference, topics
+   - text contains the text of the document
+   - reference contains the citation of the document, e.g. בראשית, פרק א
+   - topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc.
+3. Required/Excluded Terms:
+   - Required (+): +term (must contain)
+   - Excluded (-): -term (must not contain)
+   - Example: +security cloud -deprecated
+   - Equivalent to: security AND cloud AND NOT deprecated
+4. Phrase Search:
+   - Use quotes: "exact phrase"
+   - Both single/double quotes work
+   - Escape quotes with \\"
+   - Slop operator: "term1 term2"~N
+   - Example: "cloud security"~2
+   - the above will find "cloud framework and security "
+   - Prefix matching: "start of phrase"*
+5. Wildcards:
+   - ? for single character
+   - * for any number of characters
+   - Example: sec?rity cloud*
+6. Special Features:
+   - All docs: *
+   - Boost terms: term^2.0 (positive numbers only)
+   - Example: security^2.0 cloud
+   - the above will boost security by 2.0
+Query Examples:
+1. Basic: +שבת +חולה +אסור
+2. Field-specific: text:סיני AND topics:תנך
+3. Phrase with slop: "security framework"~2
+4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים
+6. Mixed: (text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש
+Tips:
+- Group complex expressions with parentheses
+- Use quotes for exact phrases
+- Add + for required terms, - for excluded terms
+- Boost important terms with ^N
+- use field-specific terms for better results.
+"""
+    def search(self, query: str, num_results: int = 10) -> List[Dict[str, Any]]:
+        """Search the Tantivy index with the given query using Tantivy's query syntax"""
+        try:
+            # Create a searcher
+            searcher = self.index.searcher()
+            # Parse and execute the query
+            try:
+                # First try with lenient parsing
+                query_parser = self.index.parse_query_lenient(query)
+                search_results = searcher.search(query_parser[0], num_results).hits
+            except Exception as query_error:
+                self.logger.error(f"Lenient query parsing failed: {query_error}")
+            # Process results
+            results = []
+            for score, doc_address in search_results:
+                doc = searcher.doc(doc_address)
+                text = doc.get_first("text")
+                # Extract highlighted snippets based on query terms
+                # Remove special syntax for highlighting while preserving Hebrew
+                highlight_terms = re.sub(
+                    r'[:"()[\]{}^~*\\]|\b(AND|OR|NOT|TO|IN)\b|[-+]',
+                    ' ',
+                    query
+                ).strip()
+                highlight_terms = [term for term in highlight_terms.split() if len(term) > 1]
+                # Create regex pattern for highlighting
+                if highlight_terms:
+                    # Escape regex special chars but preserve Hebrew
+                    patterns = [re.escape(term) for term in highlight_terms]
+                    pattern = '|'.join(patterns)
+                    # Get surrounding context for matches
+                    matches = list(re.finditer(pattern, text, re.IGNORECASE))
+                    if matches:
+                        highlights = []
+                        for match in matches:
+                            start = max(0, match.start() - 50)
+                            end = min(len(text), match.end() + 50)
+                            highlight = text[start:end]
+                            if start > 0:
+                                highlight = f"...{highlight}"
+                            if end < len(text):
+                                highlight = f"{highlight}..."
+                            highlights.append(highlight)
+                    else:
+                        highlights = [text[:100] + "..." if len(text) > 100 else text]
+                else:
+                    highlights = [text[:100] + "..." if len(text) > 100 else text]
+                result = {
+                    "score": float(score),
+                    "title": doc.get_first("title") or os.path.basename(doc.get_first("filePath") or ""),
+                    "reference": doc.get_first("reference"),
+                    "topics": doc.get_first("topics"),
+                    "file_path": doc.get_first("filePath"),
+                    "line_number": doc.get_first("segment"),
+                    "is_pdf": doc.get_first("isPdf"),
+                    "text": text,
+                    "highlights": highlights
+                }
+                results.append(result)
+            self.logger.info(f"Found {len(results)} results for query: {query}")
+            return results
+        except Exception as e:
+            self.logger.error(f"Error during search: {str(e)}")
+            return []
+    def validate_index(self) -> bool:
+        """Validate that the index exists and is accessible"""
+        try:
+            # Try to create a searcher and perform a simple search
+            searcher = self.index.searcher()
+            query_parser = self.index.parse_query("*")
+            searcher.search(query_parser, 1)
+            return True
+        except Exception as e:
+            self.logger.error(f"Index validation failed: {e}")
+            return False