Spaces:

OliverHellwig
/

vpcsearch

Sleeping

File size: 8,320 Bytes

"""
Gradio app for sentence similarity search
Launch with: python app-xyz.py
"""
import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os
from functools import lru_cache

# Configuration
# Deployment target: 'web' (hosted app) or 'local' (developer machine).
setting = 'web'
modelName = "all-MiniLM-L6-v2"

# Each deployment target keeps the model, the embedding/metadata files and
# the corpus texts under a different root directory.
modelDirectory, dataDirectory, corpusDirectory = (
    ("../../ml-data/similarity/finetuned", "../../ml-data/similarity", "../../data")
    if setting == 'local'
    else ("./model", "./data", "./corpus")
)

# Lazily filled caches; the load_* functions populate them on first use.
model = None
M = None
metainfo = None

def load_model():
    """Return the shared SentenceTransformer, creating it on first use.

    The instance is stored in the module-level ``model`` variable so that
    later calls reuse it instead of constructing it again.
    """
    global model
    if model is not None:
        return model

    model_path = f"{modelDirectory}/{modelName}"
    model = SentenceTransformer(model_path)
    # Switch to evaluation mode; the app only runs inference.
    model.eval()
    return model

def load_embeddings():
    """Return the precomputed embedding matrix, reading it on first use.

    The matrix is read from ``embeddings-<modelName>.npy`` inside
    ``dataDirectory``. When that file does not exist yet, it is created
    from the tab-separated ``.csv`` file with the same stem. The loaded
    array is stored in the module-level ``M`` and reused by later calls.
    """
    global M
    if M is not None:
        return M

    stem = f"{dataDirectory}/embeddings-{modelName}"
    binary_path = f"{stem}.npy"
    if not os.path.exists(binary_path):
        # One-off conversion: once the .npy file exists, later runs skip
        # parsing the text table altogether.
        table = np.loadtxt(f"{stem}.csv", delimiter="\t")
        np.save(binary_path, table)

    M = np.load(binary_path)
    return M

def load_metainfo():
    """Return the metadata table, reading it on first use.

    The table is read from the header-less, tab-separated file
    ``metainfo.csv`` inside ``dataDirectory`` and gets the columns
    ``genre``, ``text``, ``citation``, ``type`` and ``txttype``. The
    DataFrame is stored in the module-level ``metainfo`` and reused by
    later calls.
    """
    global metainfo
    if metainfo is None:
        metainfo = pd.read_csv(
            f"{dataDirectory}/metainfo.csv",
            sep="\t",
            header=None,
            names=["genre", "text", "citation", "type", "txttype"],
            # Citations are compared against the corpus ids, which
            # load_corpus_file reads as strings. Without this, pandas may
            # infer a numeric dtype ("1.10" -> 1.1) and the lookup never
            # matches.
            dtype={"citation": str},
        )
    return metainfo

@lru_cache(maxsize=128)
def load_corpus_file(filepath):
    """Read one corpus text file into a DataFrame with ``id`` and ``text`` columns.

    The file is parsed as header-less, tab-separated data with every value
    kept as a string. Results are memoised per ``filepath`` (up to 128
    files), so repeated lookups in the same text reuse the parsed table.
    """
    column_names = ["id", "text"]
    corpus_table = pd.read_csv(
        filepath,
        sep="\t",
        header=None,
        names=column_names,
        dtype=str,
    )
    return corpus_table

def get_top_k(query, k):
    """Return the ``k`` embedding rows most similar to ``query``.

    Args:
        query: Sentence to encode with the loaded model.
        k: Number of hits to return. Integral floats are accepted and
            converted to ``int``.

    Returns:
        A list of ``(row_index, cosine_similarity)`` tuples ordered from
        most to least similar. The list is empty when ``k`` is not
        positive and is shorter than ``k`` when fewer rows exist.
    """
    k = int(k)
    if k <= 0:
        # ``scores[-0:]`` would select the whole array and a negative k
        # would slice from the wrong end, so handle these cases up front.
        return []

    model = load_model()
    embeddings = load_embeddings()

    query_vec = model.encode([query])
    sims = cosine_similarity(query_vec, embeddings)[0]
    # argsort is ascending: take the last k entries and reverse them.
    top_k_idx = np.argsort(sims)[-k:][::-1]
    return [(idx, sims[idx]) for idx in top_k_idx]

def _find_sentence(src, citation):
    """Return the text of the first corpus row whose id equals ``citation``, else None."""
    row = src[src["id"] == citation]
    if row.empty:
        return None
    return row["text"].values[0]


def _context_sentence(meta, src, neighbor_idx, genre, text, citation):
    """Return the corpus sentence of a neighbouring metadata row, or None if unusable.

    A neighbour is only used when it lies inside the table, belongs to the
    same genre and text as the hit, and carries a different citation.
    """
    if neighbor_idx < 0 or neighbor_idx >= len(meta):
        return None
    n_genre, n_text, n_citation, _, _ = meta.iloc[neighbor_idx]
    if (n_genre, n_text) != (genre, text):
        # The neighbouring row belongs to another text; its citation must
        # not be looked up in this text's file.
        return None
    if n_citation == citation:
        # Same passage listed twice (split sentence or parallel
        # translation); it would only repeat the hit itself.
        return None
    return _find_sentence(src, n_citation)


def search_similar_sentences(query, top_k):
    """Search for sentences similar to ``query`` and format the hits as Markdown.

    Args:
        query: Query sentence entered by the user. ``None``, empty and
            whitespace-only values produce a prompt instead of a search.
        top_k: Number of distinct passages to show. Integral floats are
            accepted and converted to ``int``.

    Returns:
        A Markdown string: a header line followed by one consecutively
        numbered entry per distinct passage, each with the previous and
        next sentence of the same text as context where available. Hits
        with a similarity of at least 0.8 are marked with a star. Any
        unexpected failure is returned as an ``"Error: ..."`` string
        rather than raised.
    """
    if not query or not query.strip():
        return "Please enter a query sentence."

    try:
        # The UI slider may deliver a float; the header and the slicing in
        # get_top_k need an integer.
        top_k = int(top_k)
        meta = load_metainfo()

        # Over-fetch so that enough hits remain after de-duplication.
        results = get_top_k(query, 5 * top_k)
        seen = set()
        output_lines = [f"**Top {top_k} similar sentences for:** _{query}_\n"]
        added = 0
        for idx, score in results:
            genre, text, citation, _, _ = meta.iloc[idx]
            # When sentences are split, or there is an MT and a human
            # translation of one sentence, the same passage can occur
            # multiple times. Show each passage only once.
            key = (genre, text, citation)
            if key in seen:
                continue
            seen.add(key)
            txtPath = os.path.join(corpusDirectory, genre, f"{text}.txt")

            try:
                src = load_corpus_file(txtPath)
                sentence = _find_sentence(src, citation)
                if sentence is None:
                    sentence = f"[Line {citation} not found in {text}]"
                else:
                    previous = _context_sentence(meta, src, idx - 1, genre, text, citation)
                    if previous is not None:
                        sentence = f"{previous} / **{sentence}**"
                    following = _context_sentence(meta, src, idx + 1, genre, text, citation)
                    if following is not None:
                        sentence += f" / {following}"
                    # Only passages found in the corpus count towards top_k.
                    added += 1
            except Exception as e:
                sentence = f"[Error loading {text}: {str(e)}]"

            icon = "⭐" if score >= 0.8 else ""
            # The header occupies index 0, so the current length is the
            # consecutive rank of this entry (no gaps for skipped duplicates).
            rank = len(output_lines)
            output_lines.append(
                f"{icon}**{rank}. {genre}/{text}:{citation}** {sentence}\n"
                f"*[similarity: {score:.3f}]*\n"
            )
            if added >= top_k:
                break
        return "\n".join(output_lines)

    except Exception as e:
        return f"Error: {str(e)}"

def create_interface():
    """Build the Gradio Blocks UI for the search app and return it.

    The layout consists of a title, a collapsible usage guide, a query box
    next to a result-count slider, a search button and a Markdown result
    area. Both clicking the button and pressing Enter in the query box run
    ``search_similar_sentences``. The interface is not launched here; that
    is left to the caller.
    """
    usage_guide = """
            This tool searches for semantically similar sentences in the [Vedic Prose Corpus](https://github.com/OliverHellwig/sanskrit/tree/master/corpus/VPC) (VPC).
            It works on **English** machine translations of all VPC texts generated with Sebastian Nehrdich's Dharmamitra API.
            Therefore, your queries should resemble the style and vocabulary of these translations.
                        
            **Example queries that work:**
            - *"The stoma consists of 17 parts."*
            - *"The gods drive away the cattle of the Asuras."*
            - *"Cows are like Soma."*
            - *"They dig a hole at the sacrificial ground."*
                        
            **What will not work well (or at all):**
            - Sanskrit text. This tool operates on English translations.
            - Asking questions ("What is the meaning of the sacrifice?") or prompt instructions ("List all passages describing the agnistoma.") - this tool finds existing passages, it does not generate answers.
            - Contemporary paraphrases or colloquial language
            - Very short phrases or single words

            **Tips:**
            - Use complete, well-formed sentences.
            - Try to match the register of Vedic translations.
            - Try variations with different synonyms if initial results are poor.
            - Similarity scores above 0.8 (strong matches) are marked with a star ⭐.
            - Lower scores (0.6-0.8) may still contain relevant parallels worth exploring.
                        
            **Technical details:** The search uses *all-MiniLM-L6-v2* finetuned with several thousand records (partly human judgments, partly prompt-generated).
            """

    with gr.Blocks(title="Sentence Similarity Search") as demo:
        gr.Markdown("# Sentence Similarity Search")
        gr.Markdown("Enter a sentence to find the most similar sentences in the VPC.")

        with gr.Accordion("💡 How to use this search tool", open=False):
            gr.Markdown(usage_guide)

        # Query box (wide) next to the result-count slider (narrow).
        with gr.Row():
            with gr.Column(scale=3):
                query_input = gr.Textbox(
                    label="Enter a sentence:",
                    placeholder="Type your sentence here...",
                    lines=2,
                )
            with gr.Column(scale=1):
                top_k_slider = gr.Slider(
                    minimum=5,
                    maximum=100,
                    value=10,
                    step=5,
                    label="Number of results",
                )

        search_button = gr.Button("Search Similar Sentences", variant="primary")

        output_display = gr.Markdown(
            label="Results",
            value="Enter a query and click 'Search Similar Sentences' to see results.",
        )

        # The button click and the Enter key in the query box start the
        # same search with the same inputs and output.
        for register in (search_button.click, query_input.submit):
            register(
                fn=search_similar_sentences,
                inputs=[query_input, top_k_slider],
                outputs=output_display,
            )

    return demo

if __name__ == "__main__":
    ui = create_interface()
    # Local runs bind to the loopback address on a fixed port without a
    # share link; every other setting launches with sharing enabled.
    launch_options = (
        {"server_name": "127.0.0.1", "server_port": 7860, "share": False}
        if setting == 'local'
        else {"share": True}
    )
    ui.launch(**launch_options)