# NOTE: non-code extraction artifact removed ("Spaces: / Sleeping / Sleeping")
| import re | |
| import json | |
| from typing import Dict, Any, List, Optional | |
# Compiled once at import time: the original recompiled this pattern on every
# call, which is pure per-call overhead for an invariant regex.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"  # NOTE(review): very broad catch-all range — confirm intended
    "]+"
)


def is_valid_query(query: str) -> bool:
    """
    Validate whether a search query is legitimate.

    Rejects empty/whitespace-only queries, very short queries that are
    entirely emoji, long runs of digits, and long vowel-free strings
    (heuristic for keyboard gibberish).

    Args:
        query: The search query to validate.

    Returns:
        True if the query passes all heuristics, False otherwise.
    """
    # Reject empty or whitespace-only queries.
    if not query or query.strip() == "":
        return False
    # Reject short queries that are nothing but emoji (e.g. a single emoji).
    stripped_query = _EMOJI_PATTERN.sub('', query).strip()
    if not stripped_query and len(query) <= 5:
        return False
    # Reject context-free runs of digits (5 or more).
    if re.match(r'^\d{5,}$', query.strip()):
        return False
    # Heuristic: a long string with no vowels at all is likely gibberish.
    if len(query) > 10 and not re.search(r'[aeiouAEIOU]', query):
        return False
    return True
def format_research_results(search_results: List[Dict[str, Any]],
                            scraped_contents: Dict[str, str],
                            analyzed_contents: Dict[str, Dict[str, Any]]) -> str:
    """
    Format research results into a readable response with citations.

    Args:
        search_results: The list of search result items (currently unused
            here; kept for interface compatibility with callers).
        scraped_contents: Dict mapping URLs to scraped content (currently
            unused here; kept for interface compatibility with callers).
        analyzed_contents: Dict mapping URLs to analysis results; each value
            may carry "relevance_score" (int) and "filtered_content" (str).

    Returns:
        The formatted response with a trailing "Sources:" list, or a
        fallback message when nothing relevant/usable was found.
    """
    no_results_msg = ("I couldn't find relevant information for your query. "
                      "Could you try rephrasing or providing more details?")
    # Keep only URLs the analysis scored as relevant (threshold: 5).
    relevant_urls = {
        url: data
        for url, data in analyzed_contents.items()
        if data.get("relevance_score", 0) >= 5
    }
    if not relevant_urls:
        return no_results_msg

    response_parts = []
    citations = []
    citation_num = 0
    for url, data in relevant_urls.items():
        filtered_content = data.get("filtered_content", "")
        # Fix: previously every relevant URL received a citation number even
        # when it contributed no content, so "Sources" listed URLs that were
        # never cited in the text. Only number URLs that contribute content.
        if not filtered_content:
            continue
        citation_num += 1
        citations.append(f"[{citation_num}] {url}")
        response_parts.append(f"{filtered_content} [{citation_num}]")

    # Fix: if every relevant URL lacked usable content, the original returned
    # a malformed string consisting solely of a "Sources:" list.
    if not response_parts:
        return no_results_msg

    response = "\n\n".join(response_parts)
    citation_text = "\n".join(citations)
    return f"{response}\n\nSources:\n{citation_text}"
def extract_citations(text: str) -> List[Dict[str, str]]:
    """
    Extract citations from formatted text.

    Args:
        text: Text with citation markers like [1], [2], etc.

    Returns:
        List of citation dicts, each with "number" (the citation number as a
        string) and "text" (up to 100 chars of text preceding the marker,
        prefixed with "..." when it was truncated).
    """
    citations = []
    citation_pattern = r'\[(\d+)\]'
    for match in re.finditer(citation_pattern, text):
        citation_num = match.group(1)
        # Take up to 100 chars of context immediately before the marker.
        start_pos = max(0, match.start() - 100)
        cited_text = text[start_pos:match.start()].strip()
        # Fix: the old truncation check (`len(cited_text) == 100`) almost
        # never fired because .strip() usually changes the slice length.
        # The window was clipped iff start_pos > 0.
        if start_pos > 0:
            cited_text = "..." + cited_text
        citations.append({
            "number": citation_num,
            "text": cited_text
        })
    return citations