"""
Gradio app for sentence similarity search

Launch with:
    python app-xyz.py
"""

import os
from functools import lru_cache

import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Configuration
setting = 'web'  # or 'local'
modelName = "all-MiniLM-L6-v2"
if setting == 'local':
    modelDirectory = "../../ml-data/similarity/finetuned"
    dataDirectory = "../../ml-data/similarity"
    corpusDirectory = "../../data"
else:
    modelDirectory = "./model"
    dataDirectory = "./data"
    corpusDirectory = "./corpus"

# Global variables for loaded data; each is filled lazily by its load_* function.
model = None
M = None
metainfo = None

# Help text shown in the collapsible accordion. Kept at column 0 so that the
# Markdown list/paragraph structure does not depend on any dedenting.
_HELP_MARKDOWN = """
This tool searches for semantically similar sentences in the [Vedic Prose Corpus](https://github.com/OliverHellwig/sanskrit/tree/master/corpus/VPC) (VPC).
It works on **English** machine translations of all VPC texts generated with Sebastian Nehrdich's Dharmamitra API.
Therefore, your queries should resemble the style and vocabulary of these translations.

**Example queries that work:**
- *"The stoma consists of 17 parts."*
- *"The gods drive away the cattle of the Asuras."*
- *"Cows are like Soma."*
- *"They dig a hole at the sacrificial ground."*

**What will not work well (or at all):**
- Sanskrit text. This tool operates on English translations.
- Asking questions ("What is the meaning of the sacrifice?") or prompt instructions ("List all passages describing the agnistoma.") - this tool finds existing passages, it does not generate answers.
- Contemporary paraphrases or colloquial language
- Very short phrases or single words

**Tips:**
- Use complete, well-formed sentences.
- Try to match the register of Vedic translations.
- Try variations with different synonyms if initial results are poor.
- Similarity scores above 0.8 (strong matches) are marked with a star ⭐.
- Lower scores (0.6-0.8) may still contain relevant parallels worth exploring.

**Technical details:**
The search uses *all-MiniLM-L6-v2* finetuned with several thousand records (partly human judgments, partly prompt-generated).
"""


def load_model():
    """Load the sentence transformer model (once) and return it."""
    global model
    if model is None:
        model = SentenceTransformer(f"{modelDirectory}/{modelName}")
        model.eval()
    return model


def load_embeddings():
    """Load the precomputed embedding matrix (once) and return it.

    If the ``.npy`` file does not exist yet, it is created from the
    tab-separated ``.csv`` file of the same name.
    """
    global M
    if M is None:
        npyPath = f"{dataDirectory}/embeddings-{modelName}.npy"
        if not os.path.exists(npyPath):
            # Convert csv to npy if needed
            e = np.loadtxt(f"{dataDirectory}/embeddings-{modelName}.csv", delimiter="\t")
            np.save(npyPath, e)
        M = np.load(npyPath)
    return M


def load_metainfo():
    """Load the metadata table (once) and return it.

    All columns are read as strings: citations are later compared with the
    ``id`` column of the corpus files, which is also read as ``str``, so a
    numeric-looking citation must not be converted to a number here.
    """
    global metainfo
    if metainfo is None:
        metainfo = pd.read_csv(
            f"{dataDirectory}/metainfo.csv",
            sep="\t",
            header=None,
            names=["genre", "text", "citation", "type", "txttype"],
            dtype=str,
        )
    return metainfo


@lru_cache(maxsize=128)
def load_corpus_file(filepath):
    """Load a tab-separated corpus file (columns ``id``, ``text``) with caching."""
    return pd.read_csv(filepath, sep="\t", header=None, names=["id", "text"], dtype=str)


def get_top_k(query: str, k: int):
    """Retrieve the top-k most similar sentences for ``query``.

    Args:
        query: The query sentence.
        k: Number of hits to return. Coerced to ``int``; a non-positive
            value yields an empty list.

    Returns:
        A list of ``(row_index, cosine_similarity)`` tuples, ordered from
        most to least similar. Contains at most ``k`` entries.
    """
    k = int(k)
    if k <= 0:
        # Without this guard, ``[-0:]`` would select *every* row.
        return []
    encoder = load_model()
    embeddings = load_embeddings()
    query_vec = encoder.encode([query])
    sims = cosine_similarity(query_vec, embeddings)[0]
    top_k_idx = np.argsort(sims)[-k:][::-1]
    return [(idx, sims[idx]) for idx in top_k_idx]


def _lookup_sentence(src, citation):
    """Return the text of the first corpus row whose id is ``citation``, or None."""
    row = src[src["id"] == citation]
    if row.empty:
        return None
    return row["text"].values[0]


def _neighbor_sentence(src, meta, idx, genre, text):
    """Return the corpus sentence for metadata row ``idx`` as context, or None.

    None is returned when ``idx`` is out of range, when the row belongs to a
    different genre/text than the hit (its citation would otherwise be looked
    up in the wrong file), or when the citation is missing from ``src``.
    """
    if idx < 0 or idx >= len(meta):
        return None
    n_genre, n_text, n_citation, _, _ = meta.iloc[idx]
    if n_genre != genre or n_text != text:
        return None
    return _lookup_sentence(src, n_citation)


def search_similar_sentences(query, top_k):
    """Main search function.

    Args:
        query: The query sentence entered by the user.
        top_k: Requested number of results (value of the slider).

    Returns:
        A Markdown string: either the formatted result list, a prompt to
        enter a query, or an ``Error: ...`` message if anything fails.
    """
    if not query or not query.strip():
        return "Please enter a query sentence."

    try:
        top_k = int(top_k)

        # Load data if not already loaded
        meta = load_metainfo()

        # Fetch more candidates than requested because duplicates are dropped below.
        results = get_top_k(query, 5 * top_k)
        keys = set()

        # Format results
        output_lines = [f"**Top {top_k} similar sentences for:** _{query}_\n"]
        added = 0  # hits whose sentence was found in the corpus
        shown = 0  # entries written to the output (used for numbering)
        for idx, score in results:
            genre, text, citation, _, _ = meta.iloc[idx]
            # When sentences are split, or there is an MT and a human translation of one sentence,
            # the same passage can occur multiple times. Prevent this!
            key = f"{genre}-{text}-{citation}"
            if key in keys:
                continue
            keys.add(key)

            txtPath = os.path.join(corpusDirectory, genre, f"{text}.txt")
            try:
                src = load_corpus_file(txtPath)
                sentence = _lookup_sentence(src, citation)
                if sentence is None:
                    sentence = f"[Line {citation} not found in {text}]"
                else:
                    # Add context (previous sentence)
                    previous = _neighbor_sentence(src, meta, idx - 1, genre, text)
                    if previous is not None:
                        sentence = f"{previous} / **{sentence}**"
                    # Add context (next sentence)
                    following = _neighbor_sentence(src, meta, idx + 1, genre, text)
                    if following is not None:
                        sentence += f" / {following}"
                    added += 1
            except Exception as e:
                # Best effort: a broken or missing corpus file must not abort the search.
                sentence = f"[Error loading {text}: {str(e)}]"

            shown += 1
            icon = "⭐" if score >= 0.8 else ""
            output_lines.append(
                f"{icon}**{shown}. {genre}/{text}:{citation}** {sentence}\n"
                f"*[similarity: {score:.3f}]*\n"
            )
            if added >= top_k:
                break

        return "\n".join(output_lines)

    except Exception as e:
        # UI boundary: report the problem in the output pane instead of crashing.
        return f"Error: {str(e)}"


def create_interface():
    """Build the Gradio interface and return it (the caller launches it)."""
    with gr.Blocks(title="Sentence Similarity Search") as demo:
        gr.Markdown("# Sentence Similarity Search")
        gr.Markdown("Enter a sentence to find the most similar sentences in the VPC.")

        with gr.Accordion("💡 How to use this search tool", open=False):
            gr.Markdown(_HELP_MARKDOWN)

        with gr.Row():
            with gr.Column(scale=3):
                query_input = gr.Textbox(
                    label="Enter a sentence:",
                    placeholder="Type your sentence here...",
                    lines=2
                )
            with gr.Column(scale=1):
                top_k_slider = gr.Slider(
                    minimum=5,
                    maximum=100,
                    value=10,
                    step=5,
                    label="Number of results"
                )

        search_button = gr.Button("Search Similar Sentences", variant="primary")

        output_display = gr.Markdown(
            label="Results",
            value="Enter a query and click 'Search Similar Sentences' to see results."
        )

        # Search button
        search_button.click(
            fn=search_similar_sentences,
            inputs=[query_input, top_k_slider],
            outputs=output_display
        )

        # Trigger search on Enter key
        query_input.submit(
            fn=search_similar_sentences,
            inputs=[query_input, top_k_slider],
            outputs=output_display
        )

    return demo


if __name__ == "__main__":
    ui = create_interface()
    if setting == 'local':
        ui.launch(
            server_name="127.0.0.1",
            server_port=7860,
            share=False
        )
    else:
        ui.launch(share=True)