Spaces:
Sleeping
Sleeping
| # main.py - FastAPI Backend | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.staticfiles import StaticFiles | |
| from fastapi.responses import HTMLResponse | |
| from pydantic import BaseModel | |
| import nltk | |
| from nltk.tokenize import sent_tokenize | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
# Download the NLTK tokenizer data that sent_tokenize needs.
# Both resources are fetched quietly so startup logs stay consistent
# (previously only 'punkt' suppressed the downloader's output).
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)
# Initialize FastAPI app
app = FastAPI(title="Simple Search Engine")

# Add CORS middleware.
# A wildcard origin must not be combined with allow_credentials=True: the
# CORS spec forbids "*" for credentialed requests, and echoing arbitrary
# origins back with credentials enabled would let any site make
# authenticated calls. This API uses no cookies or auth headers, so
# credentials are disabled while any origin may still call it.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Define the document database.
# In-memory corpus: maps a document ID to its raw, multi-sentence text.
# The texts are passed to chunk_documents() at import time, which strips each
# one and splits it into sentence chunks for indexing.
documents = {
    "doc1": """
A new AI analytics tool has been released by TechCorp.
This tool uses advanced machine learning algorithms to process large datasets.
It can provide real-time insights and predictive analytics for businesses.
The tool integrates seamlessly with existing data infrastructure.
Companies can now make data-driven decisions faster than ever before.
The AI engine continuously learns from new data to improve accuracy.
""",
    "doc2": """
The quarterly finance report shows strong revenue growth.
Operating expenses have decreased by 15% compared to last quarter.
Net profit margins have improved significantly across all divisions.
The company's cash flow remains healthy with substantial reserves.
Investment in new projects is expected to yield returns next year.
Shareholders can expect increased dividends this quarter.
""",
    "doc3": """
Cloud infrastructure services from AWS and Azure are becoming essential.
Companies are migrating their legacy systems to the cloud for better scalability.
AWS offers a wide range of compute and storage options.
Azure provides excellent integration with Microsoft enterprise products.
Both platforms support hybrid cloud deployments for flexibility.
Security and compliance features are continuously being enhanced.
""",
    "doc4": """
Our new marketing campaign focuses on SEO optimization strategies.
We are targeting high-value keywords to increase organic traffic.
Social media engagement has improved by 40% this month.
Content marketing efforts are driving more qualified leads.
The campaign includes email marketing and paid search ads.
We expect to see ROI improvements within the next quarter.
""",
    "doc5": """
The AI tool leverages machine learning for predictive maintenance.
Machine learning models can detect patterns in equipment behavior.
This AI-powered solution reduces downtime and operational costs.
Deep learning techniques are applied to analyze sensor data.
The system continuously learns and adapts to new scenarios.
AI and machine learning are transforming industrial operations.
"""
}
| # Function to chunk documents | |
def chunk_documents(documents, sentences_per_chunk: int = 3):
    """Split every document into fixed-size groups of sentences.

    Each document's text is stripped, tokenized into sentences with
    ``sent_tokenize`` and grouped into consecutive runs of
    ``sentences_per_chunk`` sentences (the last run of a document may be
    shorter). Sentences within a chunk are joined with a single space.

    Args:
        documents: Mapping of document ID to raw document text.
        sentences_per_chunk: Number of sentences per chunk; must be >= 1.

    Returns:
        A ``(chunks, chunk_metadata)`` tuple of two parallel lists:
        ``chunks[i]`` is the chunk text and ``chunk_metadata[i]`` is a dict
        with keys ``'doc_id'``, ``'chunk_index'`` (0-based position of the
        chunk within its document) and ``'text'``.

    Raises:
        ValueError: If ``sentences_per_chunk`` is less than 1.
    """
    # Fail early with a clear message: a step of 0 would otherwise surface as
    # an obscure range() error, and a negative step would silently drop
    # every sentence.
    if sentences_per_chunk < 1:
        raise ValueError("sentences_per_chunk must be a positive integer")

    chunks = []
    chunk_metadata = []
    for doc_id, text in documents.items():
        sentences = sent_tokenize(text.strip())
        chunk_starts = range(0, len(sentences), sentences_per_chunk)
        for chunk_index, start in enumerate(chunk_starts):
            chunk = ' '.join(sentences[start:start + sentences_per_chunk])
            chunks.append(chunk)
            chunk_metadata.append({
                'doc_id': doc_id,
                'chunk_index': chunk_index,
                'text': chunk,
            })
    return chunks, chunk_metadata
# Initialize model and process documents at startup.
# These statements run once at import time and must stay in this order:
# load the embedding model, chunk the corpus, then embed every chunk.
print("Initializing search engine...")
model = SentenceTransformer('all-MiniLM-L6-v2')
chunks, chunk_metadata = chunk_documents(documents)
# search() looks up chunk_metadata by the position of each similarity score,
# so the embeddings must stay aligned with the order of `chunks`.
chunk_embeddings = model.encode(chunks)
print(f"Search engine ready! {len(chunks)} chunks indexed.")
| # Pydantic models | |
# Request body accepted by search(); the page sends it as JSON: {"query": "..."}.
class SearchQuery(BaseModel):
    # Free-text search string; search() rejects it when blank after stripping.
    query: str
# One ranked hit returned by search().
class SearchResult(BaseModel):
    # 1-based position in the result list (1 = most similar).
    rank: int
    # ID of the document the matching chunk came from.
    doc_id: str
    # Cosine similarity between the query embedding and the chunk embedding.
    similarity_score: float
    # Text of the matching chunk.
    text: str
| # API Endpoints | |
@app.get("/", response_class=HTMLResponse)
async def read_root():
    """Serve the single-page search UI.

    The page is a self-contained HTML document (inline CSS and JavaScript)
    that POSTs ``{"query": ...}`` as JSON to ``/search`` and renders the
    returned result list as cards.

    Returns:
        HTMLResponse: The complete HTML page.
    """
    # NOTE: the script escapes every dynamic value before placing it in
    # innerHTML; the search query is user-typed text and must never be
    # interpreted as markup.
    html_content = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Simple Search Engine</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
min-height: 100vh;
padding: 20px;
}
.container {
max-width: 900px;
margin: 0 auto;
}
.header {
text-align: center;
color: white;
margin-bottom: 40px;
padding-top: 60px;
}
.header h1 {
font-size: 3em;
margin-bottom: 10px;
text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}
.header p {
font-size: 1.2em;
opacity: 0.9;
}
.search-box {
background: white;
border-radius: 50px;
padding: 10px 20px;
box-shadow: 0 8px 30px rgba(0,0,0,0.3);
display: flex;
align-items: center;
margin-bottom: 40px;
}
.search-box input {
flex: 1;
border: none;
outline: none;
font-size: 1.1em;
padding: 10px;
}
.search-box button {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border: none;
color: white;
padding: 12px 30px;
border-radius: 25px;
font-size: 1em;
cursor: pointer;
transition: transform 0.2s;
font-weight: bold;
}
.search-box button:hover {
transform: scale(1.05);
}
.search-box button:active {
transform: scale(0.95);
}
.loading {
text-align: center;
color: white;
font-size: 1.2em;
margin: 20px 0;
display: none;
}
.loading.show {
display: block;
}
.results {
display: none;
}
.results.show {
display: block;
}
.result-card {
background: white;
border-radius: 15px;
padding: 25px;
margin-bottom: 20px;
box-shadow: 0 4px 15px rgba(0,0,0,0.2);
transition: transform 0.2s, box-shadow 0.2s;
animation: slideIn 0.5s ease-out;
}
@keyframes slideIn {
from {
opacity: 0;
transform: translateY(20px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
.result-card:hover {
transform: translateY(-5px);
box-shadow: 0 6px 25px rgba(0,0,0,0.3);
}
.result-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 15px;
}
.result-rank {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 5px 15px;
border-radius: 20px;
font-weight: bold;
font-size: 0.9em;
}
.result-doc {
color: #666;
font-size: 0.9em;
font-weight: 600;
}
.result-score {
background: #e8f5e9;
color: #2e7d32;
padding: 5px 12px;
border-radius: 15px;
font-size: 0.85em;
font-weight: bold;
}
.result-text {
color: #333;
line-height: 1.6;
font-size: 1em;
}
.no-results {
text-align: center;
color: white;
font-size: 1.2em;
margin-top: 40px;
display: none;
}
.no-results.show {
display: block;
}
.stats {
text-align: center;
color: white;
margin-bottom: 30px;
font-size: 1.1em;
opacity: 0.9;
}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>🔍 SimpleSearch</h1>
<p>Your intelligent document search engine</p>
</div>
<div class="search-box">
<input type="text" id="searchInput" placeholder="Search for documents..." />
<button onclick="performSearch()">Search</button>
</div>
<div class="loading" id="loading">
<p>🔄 Searching...</p>
</div>
<div class="stats" id="stats"></div>
<div class="results" id="results"></div>
<div class="no-results" id="noResults">
<p>No results found. Try a different query!</p>
</div>
</div>
<script>
// Allow Enter key to trigger search
document.getElementById('searchInput').addEventListener('keypress', function(e) {
if (e.key === 'Enter') {
performSearch();
}
});
async function performSearch() {
const query = document.getElementById('searchInput').value.trim();
if (!query) {
alert('Please enter a search query!');
return;
}
// Show loading, hide results
document.getElementById('loading').classList.add('show');
document.getElementById('results').classList.remove('show');
document.getElementById('noResults').classList.remove('show');
document.getElementById('stats').innerHTML = '';
try {
const response = await fetch('/search', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ query: query })
});
if (!response.ok) {
throw new Error('Search failed');
}
const data = await response.json();
displayResults(data, query);
} catch (error) {
console.error('Error:', error);
alert('Search failed. Please try again.');
} finally {
document.getElementById('loading').classList.remove('show');
}
}
// Escape a value so it can be placed inside innerHTML as plain text
function escapeHtml(value) {
const holder = document.createElement('div');
holder.textContent = String(value);
return holder.innerHTML;
}
function displayResults(results, query) {
const resultsDiv = document.getElementById('results');
const noResultsDiv = document.getElementById('noResults');
const statsDiv = document.getElementById('stats');
if (results.length === 0) {
noResultsDiv.classList.add('show');
return;
}
statsDiv.innerHTML = `Found <strong>${results.length}</strong> results for "<strong>${escapeHtml(query)}</strong>"`;
resultsDiv.innerHTML = '';
results.forEach(result => {
const card = document.createElement('div');
card.className = 'result-card';
card.style.animationDelay = `${(result.rank - 1) * 0.1}s`;
card.innerHTML = `
<div class="result-header">
<div style="display: flex; gap: 10px; align-items: center;">
<span class="result-rank">Rank ${result.rank}</span>
<span class="result-doc">${escapeHtml(result.doc_id.toUpperCase())}</span>
</div>
<span class="result-score">Score: ${result.similarity_score.toFixed(4)}</span>
</div>
<div class="result-text">${escapeHtml(result.text)}</div>
`;
resultsDiv.appendChild(card);
});
resultsDiv.classList.add('show');
}
</script>
</body>
</html>
"""
    return HTMLResponse(content=html_content)
@app.post("/search")
async def search(search_query: SearchQuery):
    """
    Search endpoint that takes a query and returns top 5 relevant chunks

    The query is embedded with the same model used for the indexed chunks and
    compared against every chunk embedding by cosine similarity.

    Args:
        search_query: Request body carrying the free-text query.

    Returns:
        A list of up to five ``SearchResult`` items ordered by descending
        similarity, with ``rank`` starting at 1.

    Raises:
        HTTPException: 400 if the query is empty or whitespace only;
            500 if embedding or scoring fails.
    """
    if not search_query.query.strip():
        raise HTTPException(status_code=400, detail="Query cannot be empty")

    top_k = 5
    try:
        # Encode the query
        query_embedding = model.encode([search_query.query])
        # Calculate cosine similarity against every indexed chunk
        similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
        # Stable argsort on the negated scores yields highest-first order
        # while keeping equal scores in index order, without materialising
        # and sorting a dict per chunk.
        best_indices = np.argsort(-similarities, kind="stable")[:top_k]
        return [
            SearchResult(
                rank=rank,
                doc_id=chunk_metadata[idx]['doc_id'],
                similarity_score=float(similarities[idx]),
                text=chunk_metadata[idx]['text'],
            )
            for rank, idx in enumerate(best_indices, 1)
        ]
    except Exception as e:
        # Top-level request boundary: report the failure as a 500 and keep
        # the original exception chained for server-side debugging.
        raise HTTPException(status_code=500, detail=f"Search error: {str(e)}") from e
@app.get("/health")
async def health_check():
    """Health check endpoint

    Returns:
        A dict with a fixed ``"status"`` of ``"healthy"`` and
        ``"total_chunks"``, the number of chunks indexed at startup.
    """
    return {"status": "healthy", "total_chunks": len(chunks)}
if __name__ == "__main__":
    # Start the server only when this file is executed directly; listens on
    # all interfaces, port 8000.
    from uvicorn import run as serve

    serve(app, port=8000, host="0.0.0.0")