Spaces:

sivan22
/

Ituria

Running

App Files Files Community

sivan22 committed on Dec 13, 2024

Commit

68b3376

verified ·

1 Parent(s): 6cd9eb9

Delete tantivy_search_agent.py

Browse files

Files changed (1) hide show

tantivy_search_agent.py +0 -165

tantivy_search_agent.py DELETED Viewed

@@ -1,165 +0,0 @@
-from typing import List, Dict, Any, Optional
-from tantivy import Index
-import logging
-import os
-import re
-class TantivySearchAgent:
-    """Run queries against a Tantivy index and return hits as plain dicts.
-
-    The index is opened once in ``__init__`` and reused for every call.
-    Besides searching, the agent provides the query-syntax instructions that
-    are meant to be given to an LLM which writes the queries.
-    """
-
-    # Query-syntax punctuation and operator keywords that are stripped from a
-    # query before its remaining words are used as highlight terms. Word
-    # characters (including Hebrew letters) are left untouched.
-    _QUERY_SYNTAX_RE = re.compile(r'[:"()[\]{}^~*\\]|\b(AND|OR|NOT|TO|IN)\b|[-+]')
-    # Characters of context kept on each side of a highlighted match.
-    _CONTEXT_CHARS = 50
-    # Length of the leading excerpt used when nothing in the text matches.
-    _FALLBACK_CHARS = 100
-
-    def __init__(self, index_path: str):
-        """Open the Tantivy index stored at ``index_path``.
-
-        Raises:
-            Exception: whatever ``Index.open`` raises; the failure is logged
-                and then re-raised unchanged.
-        """
-        self.index_path = index_path
-        self.logger = logging.getLogger(__name__)
-        try:
-            self.index = Index.open(index_path)
-            self.logger.info("Successfully opened Tantivy index at %s", index_path)
-        except Exception as e:
-            self.logger.error("Failed to open Tantivy index: %s", e)
-            raise
-
-    def get_query_instructions(self) -> str:
-        """Return instructions for the LLM on how to parse and construct Tantivy queries"""
-        return """
-Instructions for generating a query:
-1. Boolean Operators:
-   - AND: term1 AND term2 (both required)
-   - OR: term1 OR term2 (either term)
-   - Multiple words default to OR operation (cloud network = cloud OR network)
-   - AND takes precedence over OR
-   - Example: Shabath AND (walk OR go)
-2. Field-specific Terms:
-   - Field-specific terms: field:term
-   - Example: text:אדם AND reference:בראשית
-   - available fields: text, reference, topics
-   - text contains the text of the document
-   - reference contains the citation of the document, e.g. בראשית, פרק א
-   - topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc.
-3. Required/Excluded Terms:
-   - Required (+): +term (must contain)
-   - Excluded (-): -term (must not contain)
-   - Example: +security cloud -deprecated
-   - Equivalent to: security AND cloud AND NOT deprecated
-4. Phrase Search:
-   - Use quotes: "exact phrase"
-   - Both single/double quotes work
-   - Escape quotes with \\"
-   - Slop operator: "term1 term2"~N
-   - Example: "cloud security"~2
-   - the above will find "cloud framework and security "
-   - Prefix matching: "start of phrase"*
-5. Wildcards:
-   - ? for single character
-   - * for any number of characters
-   - Example: sec?rity cloud*
-6. Special Features:
-   - All docs: *
-   - Boost terms: term^2.0 (positive numbers only)
-   - Example: security^2.0 cloud
-   - the above will boost security by 2.0
-Query Examples:
-1. Basic: +שבת +חולה +אסור
-2. Field-specific: text:סיני AND topics:תנך
-3. Phrase with slop: "security framework"~2
-4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים
-6. Mixed: ((text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש
-Tips:
-- Group complex expressions with parentheses
-- Use quotes for exact phrases
-- Add + for required terms, - for excluded terms
-- Boost important terms with ^N
-- use field-specific terms for better results.
-"""
-
-    def search(self, query: str, num_results: int = 10) -> List[Dict[str, Any]]:
-        """Search the index with ``query`` written in Tantivy's query syntax.
-
-        Args:
-            query: The query string; it is parsed with the lenient parser.
-            num_results: Maximum number of hits to return. Values below 1
-                yield an empty list without touching the index.
-
-        Returns:
-            One dict per hit with the keys ``score``, ``title``, ``reference``,
-            ``topics``, ``file_path``, ``line_number``, ``is_pdf``, ``text``
-            and ``highlights``. Any failure is logged and reported as an
-            empty list; this method does not raise.
-        """
-        if num_results <= 0:
-            return []
-        try:
-            searcher = self.index.searcher()
-            try:
-                # The lenient parser returns a sequence whose first item is
-                # the query object to execute.
-                parsed_query = self.index.parse_query_lenient(query)[0]
-                hits = searcher.search(parsed_query, num_results).hits
-            except Exception as query_error:
-                self.logger.error("Lenient query parsing failed: %s", query_error)
-                # Without hits there is nothing to process; stop here instead
-                # of falling through to code that needs the search outcome.
-                return []
-            # The highlight pattern depends only on the query, so build it
-            # once rather than once per hit.
-            highlight_pattern = self._compile_highlight_pattern(query)
-            results = [
-                self._build_result(searcher.doc(doc_address), score, highlight_pattern)
-                for score, doc_address in hits
-            ]
-            self.logger.info("Found %d results for query: %s", len(results), query)
-            return results
-        except Exception as e:
-            self.logger.error("Error during search: %s", e)
-            return []
-
-    def _compile_highlight_pattern(self, query: str) -> Optional[Any]:
-        """Return a compiled regex matching the query's plain words, or None."""
-        stripped = self._QUERY_SYNTAX_RE.sub(' ', query).strip()
-        # Single-character leftovers are too noisy to be useful highlights.
-        terms = [term for term in stripped.split() if len(term) > 1]
-        if not terms:
-            return None
-        return re.compile('|'.join(re.escape(term) for term in terms), re.IGNORECASE)
-
-    def _extract_highlights(self, text: str, pattern: Optional[Any]) -> List[str]:
-        """Return a context snippet per match, or a leading excerpt if none match."""
-        snippets = []
-        if pattern is not None:
-            for match in pattern.finditer(text):
-                start = max(0, match.start() - self._CONTEXT_CHARS)
-                end = min(len(text), match.end() + self._CONTEXT_CHARS)
-                snippet = text[start:end]
-                # An ellipsis marks each side where the text was cut off.
-                if start > 0:
-                    snippet = f"...{snippet}"
-                if end < len(text):
-                    snippet = f"{snippet}..."
-                snippets.append(snippet)
-        if snippets:
-            return snippets
-        if len(text) > self._FALLBACK_CHARS:
-            return [text[:self._FALLBACK_CHARS] + "..."]
-        return [text]
-
-    def _build_result(self, doc: Any, score: Any, pattern: Optional[Any]) -> Dict[str, Any]:
-        """Convert one retrieved document and its score into a result dict."""
-        # A document may lack a "text" value; use an empty string so that one
-        # such document does not abort the whole search.
-        text = doc.get_first("text") or ""
-        file_path = doc.get_first("filePath")
-        return {
-            "score": float(score),
-            # Fall back to the file name when the document has no title.
-            "title": doc.get_first("title") or os.path.basename(file_path or ""),
-            "reference": doc.get_first("reference"),
-            "topics": doc.get_first("topics"),
-            "file_path": file_path,
-            "line_number": doc.get_first("segment"),
-            "is_pdf": doc.get_first("isPdf"),
-            "text": text,
-            "highlights": self._extract_highlights(text, pattern),
-        }
-
-    def validate_index(self) -> bool:
-        """Return True if a one-hit match-all search on the index succeeds.
-
-        Any exception raised while creating the searcher, parsing ``*`` or
-        searching is logged and reported as ``False``.
-        """
-        try:
-            searcher = self.index.searcher()
-            match_all = self.index.parse_query("*")
-            searcher.search(match_all, 1)
-            return True
-        except Exception as e:
-            self.logger.error("Index validation failed: %s", e)
-            return False