Spaces:
Sleeping
Sleeping
| # app/modules/processing/query_processor.py | |
| import re | |
| from typing import List | |
def normalize_query(query: str) -> str:
    """
    Return ``query`` in its normalized form.

    Normalization currently consists only of removing leading and trailing
    whitespace. Punctuation and casing are left untouched; this function is
    the place to add such rules if they become necessary.
    """
    trimmed = query.strip()
    return trimmed
def deduplicate_queries(queries: List[str]) -> List[str]:
    """
    Drop exact duplicates from ``queries``, keeping first occurrences.

    Comparison is plain string equality (case- and whitespace-sensitive).
    The relative order of the surviving queries is the order in which they
    first appeared in the input. A new list is returned; the input is not
    modified.
    """
    # dict keys are unique and remember insertion order, so building a dict
    # keyed by the queries keeps exactly the first occurrence of each one.
    return list(dict.fromkeys(queries))
def select_top_k_queries(queries: List[str], reranker, k: int = 5) -> List[str]:
    """
    Select the top-k queries using a reranker.

    - Keep broad coverage while reducing redundant API calls
    - Reranker must be provided externally (function/class instance)

    Args:
        queries: Candidate queries to rank.
        reranker: Callable invoked as ``reranker(queries)``. It must return
            an iterable of ``(query, score)`` pairs; any iterable works
            (list, tuple, generator).
        k: Maximum number of queries to return.

    Returns:
        Up to ``k`` queries ordered by descending score. Queries with equal
        scores keep the order in which the reranker returned them. An empty
        list is returned when ``queries`` is empty or ``k`` is not positive
        (the reranker is not called in that case).
    """
    if not queries or k <= 0:
        return []

    scored = reranker(queries)
    # sorted() builds a new list, so a list owned (or cached) by the reranker
    # is never reordered behind its back, and non-list iterables are accepted.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return [query for query, _ in ranked[:k]]
# Matches one or more list markers at the start of a line:
#   - a number followed by '.', ')' or ':' (e.g. "1.", "2)", "3:"), unless a
#     digit follows immediately, so "3.5 inch" and "12:30" stay intact;
#   - a bullet character ('-', '*', '•') followed by whitespace, so "-5 C"
#     stays intact.
# Bare leading digits ("2024 election", "3D printing") are NOT markers.
_LIST_MARKER_RE = re.compile(r"^\s*(?:(?:\d+[.):](?!\d)|[-*•](?=\s))\s*)+")


def process_queries(raw_output: str, max_queries: int = 5) -> List[str]:
    """
    Entry point: process raw LLM output into final queries.

    - Split by line
    - Remove leading list markers (numbering such as "1." / "2)" and bullets)
    - Normalize each query
    - Drop lines that are empty after cleaning
    - Deduplicate identical queries
    - Limit to max_queries

    Args:
        raw_output: Raw text, expected to contain one query per line.
        max_queries: Maximum number of queries to return.

    Returns:
        Up to ``max_queries`` unique, cleaned queries in their order of first
        appearance. An empty list is returned when ``max_queries`` is not
        positive.
    """
    if max_queries <= 0:
        return []

    cleaned = []
    for line in raw_output.splitlines():
        candidate = normalize_query(_LIST_MARKER_RE.sub("", line.strip()))
        if candidate:
            cleaned.append(candidate)

    # Deduplicate BEFORE applying the limit so repeated lines do not consume
    # slots that later, distinct queries could have filled.
    return deduplicate_queries(cleaned)[:max_queries]