Spaces:
Sleeping
Sleeping
| import re | |
| import torch | |
| import streamlit as st | |
| from sentence_transformers import SentenceTransformer, util | |
| from embeddings.embedder import initialize_embedding_model, initialize_chroma | |
| from config import EMBEDDINGS_DIR, EMBEDDING_MODEL_NAME | |
| from embeddings.latex_to_unicode import LATEX_TO_UNICODE | |
def decode_latex(text: str) -> str:
    """Convert known LaTeX escapes in *text* to Unicode and strip leftovers.

    Known sequences are substituted via the LATEX_TO_UNICODE table; any
    remaining LaTeX commands (``\\name`` or ``\\name{...}``) are removed,
    then stray braces are dropped and the result is trimmed.
    """
    for latex_seq, unicode_char in LATEX_TO_UNICODE.items():
        text = text.replace(latex_seq, unicode_char)
    # Remove any commands the table did not cover, e.g. "\alpha" or "\frac{x}".
    cleaned = re.sub(r"\\[a-zA-Z]+(\{.*?\})?", "", text)
    return cleaned.replace("{", "").replace("}", "").strip()
# Module-level sentence encoder used by best_sentence_by_embedding to score
# per-sentence similarity within a single document; loaded once at import
# time (model loading is expensive, so it must not happen per query).
sentence_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
def best_sentence_by_embedding(content: str, query: str):
    """Return the sentence of *content* most semantically similar to *query*.

    Args:
        content: document text, split into sentences on ``.!?`` boundaries.
        query: the user's search string.

    Returns:
        Tuple of (best_sentence, cosine_similarity). Returns ("", 0.0) when
        *content* contains no non-empty sentences.
    """
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", content) if s.strip()]
    # BUG FIX: empty/whitespace content previously reached torch.argmax on an
    # empty score tensor and raised; return a neutral result instead.
    if not sentences:
        return "", 0.0
    # Encode sentences and query in one batch; the last row is the query.
    embeddings = sentence_model.encode(sentences + [query], convert_to_tensor=True)
    cosine_scores = util.cos_sim(embeddings[-1], embeddings[:-1])[0]
    best_idx = int(torch.argmax(cosine_scores))
    return sentences[best_idx], cosine_scores[best_idx].item()
def semantic_search(vectordb, query, k=5):
    """Run a top-k similarity search against *vectordb*, preferring scored results.

    Some backends do not implement ``similarity_search_with_score``; in that
    case we degrade to the unscored API and pair every hit with ``None`` so
    callers always receive ``(document, distance_or_None)`` tuples.
    """
    try:
        return vectordb.similarity_search_with_score(query, k=k)
    except Exception:
        # Best-effort fallback: unscored hits, distance unknown.
        hits = vectordb.similarity_search(query, k=k)
        return [(hit, None) for hit in hits]
def normalize_score(distance):
    """Map a distance to a similarity in (0, 1]; a missing distance maps to 0.0.

    Smaller distances yield scores closer to 1; ``None`` (no score available)
    yields 0.0.
    """
    return 0.0 if distance is None else 1 / (1 + distance)
def get_user_input():
    """Collect the search query and the desired result count from the UI.

    Returns:
        Tuple of (query_text, k) where k is between 1 and 10 (default 5).
    """
    search_text = st.text_input("Enter search query:")
    result_count = st.slider("Number of results", min_value=1, max_value=10, value=5)
    return search_text, result_count
def truncate_sentence(text: str, max_len: int = 1000) -> str:
    """Clip *text* to at most *max_len* characters, appending '...' when clipped."""
    if len(text) > max_len:
        return text[:max_len] + "..."
    return text
def process_results(results_with_scores, query):
    """Deduplicate, enrich, and rank raw search hits for display.

    Args:
        results_with_scores: iterable of (document, distance) pairs; distance
            may be ``None`` when the backend returned unscored results (see
            semantic_search's fallback path).
        query: the user's search string, used for per-sentence relevance.

    Returns:
        List of result dicts, sorted by ``final_score`` descending.
    """
    ranked_results = []
    seen_ids = set()
    for doc, doc_score in results_with_scores:
        metadata = doc.metadata or {}
        doc_id = metadata.get("id", "N/A")
        if doc_id in seen_ids:  # drop duplicate documents
            continue
        seen_ids.add(doc_id)
        categories = metadata.get("categories", "N/A")
        year = metadata.get("year", "N/A")
        raw_content = decode_latex(doc.page_content)
        # Content is assumed to look like "Title: <t>. ... Abstract: <a>"
        # — TODO confirm against the embedding pipeline.
        title = raw_content.split(". ", 1)[0].replace("Title: ", "").strip()
        # BUG FIX: previously indexed [1] unconditionally and raised
        # IndexError for documents without an "Abstract:" marker.
        parts = raw_content.split("Abstract:", 1)
        content = parts[1].strip() if len(parts) > 1 else raw_content
        best_sentence, local_relevance = best_sentence_by_embedding(content, query)
        # BUG FIX: doc_score is None on the unscored fallback path, and
        # `1 - doc_score` raised TypeError. Mirror display_results and treat
        # a missing distance as 0 (i.e. relevance 1).
        doc_relevance = 1 - (doc_score if doc_score else 0)
        final_score = 0.6 * local_relevance + 0.4 * doc_relevance
        ranked_results.append(
            {
                "doc": doc,
                "doc_id": doc_id,
                "categories": categories,
                "year": year,
                "title": title,
                "content": content,
                "best_sentence": best_sentence,
                "local_relevance": local_relevance,
                "doc_score": doc_score,
                "final_score": final_score,
            }
        )
    return sorted(ranked_results, key=lambda x: x["final_score"], reverse=True)
def display_results(ranked_results):
    """Render each ranked result (header, metadata, title, excerpt) in Streamlit.

    The most relevant sentence is highlighted in bold inside the excerpt,
    and the excerpt is truncated for readability.
    """
    st.success(f"Top {len(ranked_results)} results found:")
    for rank, item in enumerate(ranked_results, 1):
        best = item["best_sentence"]
        # Bold only the first occurrence of the best sentence.
        highlighted = item["content"].replace(best, f"**{best}**", 1)
        st.markdown(f"**RESULT {rank}:**")
        st.markdown(
            f"Document ID: {item['doc_id']} | Categories: {item['categories']} | Year: {item['year']} | "
            f"Doc Relevance: {1 - (item['doc_score'] if item['doc_score'] else 0):.2f} | "
            f"Best Sentence Relevance: {item['local_relevance']:.2f}"
        )
        st.markdown(f"Title: {item['title']}")
        st.markdown(f"Most Relevant Excerpt: {truncate_sentence(highlighted)}")
        st.markdown("---")
def run_search(embedding_model=None, vectordb=None):
    """Drive the semantic-search page: set up models, read input, show results.

    Args:
        embedding_model: optional pre-initialized embedding model; created
            lazily when omitted.
        vectordb: optional pre-initialized Chroma store; created lazily when
            omitted. When no store can be opened, a warning is shown and the
            page renders nothing further.
    """
    st.header("🔎 Semantic Search")
    st.subheader("Search for semantically similar documents")

    if embedding_model is None:
        embedding_model = initialize_embedding_model()
    if vectordb is None:
        vectordb = initialize_chroma(embedding_model, EMBEDDINGS_DIR)
    if not vectordb:
        st.warning("No ChromaDB found. Run embeddings generation first.")
        return

    query, k = get_user_input()
    if not query:
        st.info("Type a query above to start searching.")
        return

    # Over-fetch (2x) — presumably so deduplication in process_results still
    # leaves roughly k distinct documents; verify against ranking needs.
    hits = semantic_search(vectordb, query, k=k * 2)
    if not hits:
        st.warning("No results found.")
        return

    display_results(process_results(hits, query))