Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import schedule | |
| import time | |
| import requests | |
| import threading | |
| from search_utils import SemanticSearch | |
| import logging | |
| import time | |
| import psutil | |
| from urllib.parse import quote | |
| import threading | |
| import re | |
| from pathlib import Path | |
| from urllib.parse import urlparse | |
def ping_server():
    """Ping the hosted Space so the free-tier container is not put to sleep.

    Best-effort keep-alive: any request failure is reported and swallowed.
    A bounded timeout prevents a stalled server from blocking the
    scheduler thread indefinitely (the original call had no timeout and
    could hang forever).
    """
    try:
        print("Pinging server")
        # timeout=10: fail fast instead of wedging the daemon scheduler loop.
        requests.get("https://testys-semantic-search.hf.space", timeout=10)
    except requests.exceptions.RequestException:
        print("Server is down")
# Register the keep-alive ping to fire every 10 minutes (run by run_schedule below).
schedule.every(10).minutes.do(ping_server)
def run_schedule():
    """Drive the `schedule` library's job queue, polling once per second.

    Intended to run on a background daemon thread; this function never
    returns.
    """
    poll_seconds = 1
    while True:
        schedule.run_pending()
        time.sleep(poll_seconds)
# Run the scheduler loop on a daemon thread so it cannot keep the
# process alive after the main program exits.
thread = threading.Thread(target=run_schedule, daemon=True)
thread.start()
# Configure logging: timestamped INFO-and-above records to the stream handler.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger("SemanticSearchApp")
| # Security validation functions | |
def is_valid_url(url):
    """Return True if *url* parses with both a scheme and a network location.

    A cheap structural check (accepts "https://example.com", rejects
    "not a url"); it does not verify that the host actually exists.

    Args:
        url: Candidate URL string.

    Returns:
        bool: True when the parsed URL has a non-empty scheme and netloc.
    """
    try:
        result = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt and SystemExit.
        return False
    return all([result.scheme, result.netloc])
def sanitize_query(query):
    """Strip disallowed characters from a user query and cap its length.

    Keeps only word characters, whitespace, and hyphens, truncated to 256
    characters. If sanitisation itself fails, the error is logged and the
    raw query (length-capped) is returned as a best-effort fallback.
    """
    try:
        cleaned = re.sub(r'[^\w\s-]', '', query)
    except Exception as e:
        logger.error(f"Query sanitization failed: {str(e)}")
        return query[:256]
    return cleaned[:256]
def add_diagnostics_ui(search_system):
    """Render a sidebar expander with metadata, FAISS, and resource diagnostics.

    Args:
        search_system: Initialized search backend exposing
            ``metadata_mgr.metadata_path`` (Path to the parquet metadata
            directory — assumed a directory, TODO confirm), ``shard_dir``
            (Path to the FAISS index) and ``total_vectors``.
    """
    with st.sidebar.expander("π§ Diagnostics", expanded=False):
        col1, col2 = st.columns(2)

        # Metadata validation
        with col1:
            st.subheader("π Metadata Validation")
            metadata_dir = search_system.metadata_mgr.metadata_path
            if metadata_dir.exists():
                # Count the actual parquet files. Previously the directory
                # itself was wrapped in a one-element list, so the file
                # count was always 1 and the check could never fail.
                parquet_files = list(metadata_dir.glob("*.parquet"))
                status = len(parquet_files) > 0
                st.write(f"Directory: `{metadata_dir}`")
                st.write(f"Parquet Files: {len(parquet_files)}")
                st.success("β Valid metadata" if status else "β No parquet files found")
            else:
                st.error("Metadata directory not found")

        # FAISS validation
        with col2:
            st.subheader("π FAISS Validation")
            faiss_path = search_system.shard_dir
            if faiss_path.exists():
                st.write(f"Index Path: `{faiss_path}`")
                st.success("β Index loaded")
                st.write(f"Vectors: {search_system.total_vectors}")
            else:
                st.error("FAISS index not found")

        # System resources
        st.subheader("π» System Resources")
        col_res1, col_res2 = st.columns(2)
        with col_res1:
            # Resident set size in MiB for this process.
            mem_usage = psutil.Process().memory_info().rss // 1024 ** 2
            st.metric("Memory Usage", f"{mem_usage} MB")
        with col_res2:
            cpu_usage = psutil.cpu_percent()
            # Red above 80% CPU, green otherwise.
            status_color = "#ff0000" if cpu_usage > 80 else "#00ff00"
            st.markdown(f"<span style='color:{status_color}'>CPU: {cpu_usage}%</span>",
                        unsafe_allow_html=True)
def main():
    """Streamlit entry point: page config, search box, results, health sidebar."""
    st.set_page_config(
        page_title="Semantic Search Engine",
        page_icon="π",
        layout="wide"
    )

    # Custom CSS styling
    st.markdown("""
    <style>
    .metric-box {
        padding: 15px;
        border-radius: 8px;
        background: #f8f9fa;
        margin: 10px 0;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .result-card {
        padding: 15px;
        border-left: 4px solid #1e88e5;
        margin: 10px 0;
        background: #fff;
    }
    </style>
    """, unsafe_allow_html=True)

    # Initialize search system once per server process. Without
    # @st.cache_resource the backend was rebuilt on EVERY Streamlit rerun
    # (each keystroke/interaction), and the "Refresh System" button's
    # st.cache_resource.clear() call was a no-op because nothing was cached.
    @st.cache_resource
    def init_search_system():
        """Build and initialize the SemanticSearch backend (cached resource)."""
        try:
            system = SemanticSearch()
            system.initialize_system()
            logger.info("Search system initialized")
            return system
        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            st.error("System initialization error")
            st.stop()

    # NOTE: no outer try/except here on purpose — st.stop() raises a
    # control-flow exception that a blanket `except Exception` would catch,
    # turning a clean stop into a spurious "Critical error" message.
    search_system = init_search_system()

    # Main UI components
    st.title("π Academics Research Semantics Search Engine")
    query = st.text_input("Search knowledge base:", placeholder="Enter your query...")

    if query:
        clean_query = sanitize_query(query)
        if not clean_query:
            st.warning("Invalid query format")
            st.stop()

        with st.spinner("Analyzing documents..."):
            start_time = time.time()
            try:
                # Top-5 results; expected to be a pandas DataFrame.
                results = search_system.search(clean_query, 5)
                search_duration = time.time() - start_time

                if results.empty:
                    st.warning("No matches found")
                    st.info("Try refining your search terms")
                else:
                    st.subheader(f"Top Results ({search_duration:.2f}s)")
                    for index, res in results.iterrows():
                        logger.info(f"Results: {res}")
                        with st.expander(res["title"]):
                            st.markdown(f"**Summary**: {res['summary']}")
                            similarity = res['similarity']
                            st.progress(similarity)
                            st.markdown(f"**Confidence**: {similarity:.1%}")
                            st.markdown(f"**Authors**: {res['authors']}")
                            st.write("**Sources**:")
                            # res['source'] is assumed iterable of URLs — TODO confirm.
                            for url in res['source']:
                                st.markdown(f"- [{url}]({url})")
                            st.write(f"[Google Scholar Search](https://scholar.google.com/scholar?q={quote(res['title'])})")
            except Exception as e:
                logger.error(f"Search error: {str(e)}")
                st.error("Search operation failed")

    # System status sidebar
    with st.sidebar:
        st.subheader("π System Health")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Documents", f"{search_system.metadata_mgr.total_docs:,}")
        with col2:
            vectors = search_system.total_vectors
            st.metric("Vectors", f"{vectors:,}")

        # Diagnostics section
        if st.checkbox("Show advanced diagnostics"):
            add_diagnostics_ui(search_system)

        # System monitoring
        st.subheader("βοΈ Monitoring")
        with st.expander("Performance"):
            mem = psutil.virtual_memory()
            st.write(f"Memory: {mem.percent}% used")
            st.write(f"CPU Cores: {psutil.cpu_count()}")
            st.write(f"Active threads: {threading.active_count()}")

        # Clearing the resource cache forces init_search_system to rebuild
        # the backend on the rerun triggered below.
        if st.button("π Refresh System"):
            st.cache_resource.clear()
            st.rerun()

    # Footer
    st.markdown("""
    <footer style="text-align: center; padding: 1rem; margin-top: 2rem; border-top: 1px solid #ddd;">
        <small>© 2025 Joel Adesanya's MSc Project.</small>
    </footer>
    """, unsafe_allow_html=True)


if __name__ == "__main__":
    main()