bharatcoder committed on
Commit
0c61b8c
·
verified ·
1 Parent(s): eba5d45

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +740 -0
app.py CHANGED
@@ -404,6 +404,746 @@ def search_rs_studies(
404
  results = search_knowledge_base(query, num_results, source_filter, task_type)
405
  return json.dumps(results, indent=2)
406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
 
408
  with gr.Blocks() as demo:
409
  gr.Markdown(
 
404
  results = search_knowledge_base(query, num_results, source_filter, task_type)
405
  return json.dumps(results, indent=2)
406
 
407
def get_rs_sources() -> str:
    """Report the data sources available in the RS Studies knowledge base.

    Returns:
        JSON string listing the available sources, their statistics, and
        collection information.
    """
    # Delegate to the shared helper and serialize its payload verbatim.
    return json.dumps(get_available_sources(), indent=2)
416
+
417
def ask_rs_question(question: str, context_size: int = 3) -> str:
    """Answer a question about RS trading systems with ranked context.

    A higher-level tool that retrieves relevant chunks and presents them
    in a question-answering format.

    Args:
        question: Your question about RS systems, trading, or related topics.
        context_size: Number of relevant chunks to include in context
            (1-10, default: 3).

    Returns:
        JSON string with the question, relevant context chunks, and analysis.
    """
    if not question or not question.strip():
        return json.dumps({
            "error": "Question cannot be empty",
            "context": [],
            "success": False
        })

    context_size = max(1, min(context_size, config.MAX_CONTEXT_SIZE))

    # Retrieve candidates using the question-answering prompt mode.
    found = search_knowledge_base(question, context_size, task_type="question_answering")
    if not found.get("success", False):
        return json.dumps(found)

    hits = found.get("results", [])[:context_size]

    relevant = [
        {
            "rank": idx + 1,
            "content": hit["content"],
            "source": f"{hit['source_folder']} (chunk {hit['chunk_number']})",
            "relevance_score": f"{hit['similarity_score']:.3f}",
            "chunk_file": hit["chunk_file"]
        }
        for idx, hit in enumerate(hits)
    ]

    # Sets are not JSON-serializable, so the source names go out sorted.
    answer = {
        "question": question,
        "context_chunks": len(found.get("results", [])),
        "relevant_context": relevant,
        "sources_used": sorted({hit["source_folder"] for hit in hits}),
        "success": True
    }

    return json.dumps(answer, indent=2)
470
+
471
def get_collection_info() -> str:
    """
    Get detailed information about the RS Studies knowledge base collection.

    Returns:
        JSON string with collection statistics, configuration, and metadata structure
    """
    # FIX: guard against use before startup, consistent with the other tools;
    # previously an uninitialized `collection` surfaced as a generic exception.
    if not ensure_initialized():
        return json.dumps({
            "error": "Server not properly initialized",
            "success": False
        })

    try:
        total_count = collection.count()

        # Sample a few records to discover which metadata keys are in use.
        sample_results = collection.get(limit=10, include=["metadatas"])

        metadata_keys = set()
        for metadata in sample_results["metadatas"]:
            metadata_keys.update(metadata.keys())

        info = {
            "collection_name": config.COLLECTION_NAME,
            "total_documents": total_count,
            "model_path": config.MODEL_PATH,
            "device": device,
            "metadata_structure": sorted(metadata_keys),
            "config": {
                "max_results": config.MAX_NUM_RESULTS,
                "valid_sources": config.VALID_SOURCES
            },
            "success": True
        }
        return json.dumps(info, indent=2)

    except Exception as e:
        return json.dumps({"error": f"Failed to get collection info: {str(e)}", "success": False})
506
+
507
+
508
def search_by_source(source_name: str, query: str = "", num_results: int = 10) -> str:
    """Browse or search within a specific data source.

    Args:
        source_name: Source to search within (use get_rs_sources to see
            the available sources).
        query: Optional search query; when blank, recent chunks from the
            source are browsed instead.
        num_results: Number of results to return (1-50, default: 10).

    Returns:
        JSON string with results from the specified source.
    """
    if source_name not in config.VALID_SOURCES:
        return json.dumps({
            "error": f"Invalid source_name. Must be one of: {config.VALID_SOURCES}",
            "results": [],
            "success": False
        })

    num_results = max(1, min(num_results, config.MAX_NUM_RESULTS))

    if query.strip():
        # Search mode: delegate to the regular semantic search, filtered
        # to this source.
        payload = search_knowledge_base(query, num_results, source_name)
        return json.dumps(payload, indent=2)

    # Browsing mode: pull chunks straight from the collection.
    if not ensure_initialized():
        return json.dumps({
            "error": "Server not properly initialized",
            "results": [],
            "success": False
        })

    try:
        fetched = collection.get(
            where={"source_folder": {"$eq": source_name}},
            limit=num_results,
            include=["documents", "metadatas"]
        )

        formatted = []
        for position, (text, meta) in enumerate(zip(fetched["documents"], fetched["metadatas"]), start=1):
            formatted.append({
                "rank": position,
                "content": text,
                "source_folder": meta.get("source_folder", "unknown"),
                "chunk_file": meta.get("chunk_file", "unknown"),
                "chunk_number": meta.get("chunk_number", "unknown"),
                "chunk_length": meta.get("chunk_length", 0),
                "metadata": meta
            })

        payload = {
            "source_name": source_name,
            "query": query or "(browsing mode)",
            "num_results": len(formatted),
            "results": formatted,
            "success": True
        }

    except Exception as e:
        payload = {
            "error": f"Failed to browse source: {str(e)}",
            "results": [],
            "success": False
        }

    return json.dumps(payload, indent=2)
577
+
578
def verify_fact_rs(statement: str, num_evidence: int = 3) -> str:
    """Verify a statement against the RS Studies knowledge base.

    Uses EmbeddingGemma's fact-checking optimization to retrieve evidence
    that supports or contradicts the claim.

    Args:
        statement: The statement or claim to verify.
        num_evidence: Number of evidence chunks to return (1-10, default: 3).

    Returns:
        JSON string with evidence chunks ranked by relevance to the claim.
    """
    if not statement or not statement.strip():
        return json.dumps({
            "error": "Statement cannot be empty",
            "evidence": [],
            "success": False
        })

    num_evidence = max(1, min(num_evidence, config.MAX_CONTEXT_SIZE))

    # Retrieve evidence using the fact-checking prompt mode.
    found = search_knowledge_base(statement, num_evidence, task_type="fact_checking")
    if not found.get("success", False):
        return json.dumps(found)

    hits = found.get("results", [])

    evidence = [
        {
            "rank": idx + 1,
            "content": hit["content"],
            "source": f"{hit['source_folder']} (chunk {hit['chunk_number']})",
            "relevance_score": f"{hit['similarity_score']:.3f}",
            "chunk_file": hit["chunk_file"]
        }
        for idx, hit in enumerate(hits)
    ]

    # Sets are not JSON-serializable, so the consulted sources go out sorted.
    report = {
        "statement": statement,
        "evidence_count": len(hits),
        "evidence": evidence,
        "sources_consulted": sorted({hit["source_folder"] for hit in hits}),
        "success": True
    }

    return json.dumps(report, indent=2)
631
+
632
+
633
def compare_similarity_rs(text1: str, text2: str, context_size: int = 5) -> str:
    """Compare two concepts through the RS Studies knowledge base.

    Retrieves content related to each concept with EmbeddingGemma's
    semantic-similarity optimization and reports the overlap between them.

    Args:
        text1: First concept, topic, or text to compare.
        text2: Second concept, topic, or text to compare.
        context_size: Relevant chunks to analyze per concept (1-10, default: 5).

    Returns:
        JSON string with related content for both concepts and similarity analysis.
    """
    if not text1 or not text1.strip() or not text2 or not text2.strip():
        return json.dumps({
            "error": "Both text1 and text2 must be provided",
            "analysis": {},
            "success": False
        })

    context_size = max(1, min(context_size, config.MAX_CONTEXT_SIZE))

    # One retrieval per concept, both in semantic-similarity mode.
    found1 = search_knowledge_base(text1, context_size, task_type="semantic_similarity")
    found2 = search_knowledge_base(text2, context_size, task_type="semantic_similarity")

    if not found1.get("success", False) or not found2.get("success", False):
        return json.dumps({
            "error": "Failed to search for one or both concepts",
            "analysis": {},
            "success": False
        })

    def summarize(entry, position):
        # Truncate long chunks so the comparison payload stays compact.
        text = entry["content"]
        preview = text[:200] + "..." if len(text) > 200 else text
        return {
            "rank": position + 1,
            "content": preview,
            "source": f"{entry['source_folder']} (chunk {entry['chunk_number']})",
            "relevance": f"{entry['similarity_score']:.3f}"
        }

    hits1 = found1.get("results", [])
    hits2 = found2.get("results", [])
    sources1 = {hit["source_folder"] for hit in hits1}
    sources2 = {hit["source_folder"] for hit in hits2}

    report = {
        "concept1": text1,
        "concept2": text2,
        "concept1_results": len(hits1),
        "concept2_results": len(hits2),
        "shared_sources": sorted(sources1 & sources2),
        "concept1_unique_sources": sorted(sources1 - sources2),
        "concept2_unique_sources": sorted(sources2 - sources1),
        "concept1_context": [summarize(hit, idx) for idx, hit in enumerate(hits1)],
        "concept2_context": [summarize(hit, idx) for idx, hit in enumerate(hits2)],
        "success": True
    }

    return json.dumps(report, indent=2)
702
+
703
+
704
def classify_content_rs(content: str, categories: Optional[List[str]] = None) -> str:
    """
    Classify content against RS Studies knowledge categories.

    Uses EmbeddingGemma's classification optimization to categorize content
    based on the RS Studies knowledge base.

    Args:
        content: Text content to classify
        categories: Optional list of specific categories to check against
            (defaults to major RS topics). NOTE(review): the labels are echoed
            in the response, but the classification itself is driven by the
            source folders of the nearest chunks.

    Returns:
        JSON string with classification results and supporting evidence
    """
    # FIX: annotation was `List[str] = None` (implicit Optional, invalid
    # per PEP 484); behavior is unchanged.
    if not content or not content.strip():
        return json.dumps({
            "error": "Content cannot be empty",
            "classification": {},
            "success": False
        })

    # Default categories based on RS Studies sources.
    if categories is None:
        categories = [
            "trading systems",
            "market analysis",
            "Chennai meetup discussions",
            "Q&A topics",
            "technical strategies"
        ]

    # Retrieve the nearest chunks using the classification prompt mode.
    search_results = search_knowledge_base(content, 8, task_type="classification")

    if not search_results.get("success", False):
        return json.dumps(search_results)

    # Group the matches by source folder to see where the content fits best.
    source_distribution = {}
    for result in search_results.get("results", []):
        source = result["source_folder"]
        source_distribution.setdefault(source, []).append({
            "content": result["content"][:150] + "..." if len(result["content"]) > 150 else result["content"],
            "similarity": result["similarity_score"]
        })

    response = {
        "content": content[:200] + "..." if len(content) > 200 else content,
        "available_categories": categories,
        "source_distribution": source_distribution,
        "top_matches": [
            {
                "rank": i + 1,
                "content": r["content"][:150] + "..." if len(r["content"]) > 150 else r["content"],
                "source_category": r["source_folder"],
                "similarity_score": f"{r['similarity_score']:.3f}"
            }
            for i, r in enumerate(search_results.get("results", [])[:5])
        ],
        "success": True
    }

    return json.dumps(response, indent=2)
770
+
771
+ # ==================================================
772
+ # QnA-ENHANCED EMBEDDING TOOLS
773
+ # ==================================================
774
+
775
def search_by_embedding_type(
    query: str,
    embedding_type: str = "content",
    num_results: int = 5,
    source_filter: Optional[str] = None
) -> str:
    """
    Search the knowledge base using specific embedding types for optimized retrieval.

    Leverages the QnA-enhanced embeddings to provide targeted search based on
    different content representations of the same chunks.

    Args:
        query: Your search question or topic (required)
        embedding_type: Type of embedding to search:
            - 'content': Original chunk content (default)
            - 'enhanced_content': Content enhanced with QnA context
            - 'questions': Questions-only embeddings for question matching
            - 'answers': Answers-only embeddings for factual retrieval
        num_results: Number of results to return (1-50, default: 5)
        source_filter: Limit to specific source folder (optional)

    Returns:
        JSON string with search results optimized for the specified embedding type
    """
    # Validate parameters.
    if not query or not query.strip():
        return json.dumps({"error": "Query cannot be empty", "results": [], "success": False})

    valid_embedding_types = ["content", "enhanced_content", "questions", "answers"]
    if embedding_type not in valid_embedding_types:
        return json.dumps({
            "error": f"Invalid embedding_type. Must be one of: {valid_embedding_types}",
            "results": [],
            "success": False
        })

    num_results = max(1, min(num_results, config.MAX_NUM_RESULTS))

    try:
        # Pick the prompt mode that matches the representation being searched.
        if embedding_type == "questions":
            formatted_query = EmbeddingGemmaPrompts.encode_query(query, "question_answering")
        elif embedding_type == "answers":
            formatted_query = EmbeddingGemmaPrompts.encode_query(query, "fact_checking")
        else:
            formatted_query = EmbeddingGemmaPrompts.encode_query(query, "search")

        query_embedding = model.encode([formatted_query], device=device)

        # FIX: ChromaDB rejects a `where` clause with more than one top-level
        # key; multiple conditions must be combined explicitly with $and.
        if source_filter:
            where_clause = {"$and": [
                {"embedding_type": {"$eq": embedding_type}},
                {"source_folder": {"$eq": source_filter}}
            ]}
        else:
            where_clause = {"embedding_type": embedding_type}

        search_results = collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=num_results,
            where=where_clause
        )

        # Flatten ChromaDB's per-query result lists into ranked entries.
        results = []
        for i, (doc, metadata, distance) in enumerate(zip(
            search_results['documents'][0],
            search_results['metadatas'][0],
            search_results['distances'][0]
        )):
            results.append({
                "rank": i + 1,
                "content": doc,
                # Convert distance to a similarity score (higher = closer).
                "similarity_score": 1 - distance,
                "embedding_type": metadata.get("embedding_type", "unknown"),
                "enhanced": metadata.get("enhanced", False),
                "qna_count": metadata.get("qna_count", 0),
                "source_folder": metadata.get("source_folder", "unknown"),
                "chunk_number": metadata.get("chunk_number", "unknown"),
                "chunk_file": metadata.get("chunk_file", "unknown")
            })

        return json.dumps({
            "query": query,
            "embedding_type": embedding_type,
            "results_found": len(results),
            "source_filter": source_filter,
            "results": results,
            "success": True
        }, indent=2)

    except Exception as e:
        return json.dumps({
            "error": f"Search failed: {str(e)}",
            "query": query,
            "embedding_type": embedding_type,
            "results": [],
            "success": False
        })
875
+
876
+
877
def smart_multi_search(
    query: str,
    num_results_per_type: int = 3,
    source_filter: Optional[str] = None,
    combine_strategy: str = "best_of_each"
) -> str:
    """
    Perform intelligent multi-type search across different embedding types.

    Searches across multiple embedding types and combines results to provide
    comprehensive coverage of relevant information.

    Args:
        query: Your search question or topic (required)
        num_results_per_type: Results per embedding type (1-10, default: 3)
        source_filter: Limit to specific source folder (optional)
        combine_strategy: How to combine results:
            - 'best_of_each': Top results from each type
            - 'relevance_ranked': All results ranked by similarity
            - 'type_weighted': Weighted by embedding type appropriateness

    Returns:
        JSON string with combined search results and analysis
    """
    if not query or not query.strip():
        return json.dumps({"error": "Query cannot be empty", "results": [], "success": False})

    # FIX: an unrecognized strategy previously fell through every combining
    # branch and returned an empty result set flagged success=True; reject
    # it up front instead.
    valid_strategies = ["best_of_each", "relevance_ranked", "type_weighted"]
    if combine_strategy not in valid_strategies:
        return json.dumps({
            "error": f"Invalid combine_strategy. Must be one of: {valid_strategies}",
            "results": [],
            "success": False
        })

    num_results_per_type = max(1, min(num_results_per_type, 10))

    try:
        all_results = {}
        embedding_types = ["content", "enhanced_content", "questions", "answers"]

        # One search per embedding type; failed searches contribute nothing.
        for emb_type in embedding_types:
            search_result = search_by_embedding_type(
                query, emb_type, num_results_per_type, source_filter
            )
            result_data = json.loads(search_result)
            all_results[emb_type] = result_data["results"] if result_data.get("success", False) else []

        combined_results = []

        if combine_strategy == "best_of_each":
            # Keep each type's hits in their native order.
            for emb_type, results in all_results.items():
                for result in results:
                    result["search_type"] = emb_type
                    combined_results.append(result)

        elif combine_strategy == "relevance_ranked":
            # Pool everything, then sort globally by raw similarity.
            for emb_type, results in all_results.items():
                for result in results:
                    result["search_type"] = emb_type
                    combined_results.append(result)
            combined_results.sort(key=lambda x: x["similarity_score"], reverse=True)

        else:  # "type_weighted"
            query_lower = query.lower()

            # Simple heuristics: boost question embeddings for question-like
            # queries and answer embeddings for definition-like queries.
            weights = {
                "content": 1.0,
                "enhanced_content": 1.2,  # Slightly favor enhanced
                "questions": 1.5 if any(word in query_lower for word in ["what", "how", "why", "when", "where", "?"]) else 0.8,
                "answers": 1.3 if any(word in query_lower for word in ["define", "explain", "meaning", "is"]) else 0.9
            }

            for emb_type, results in all_results.items():
                for result in results:
                    result["search_type"] = emb_type
                    result["weighted_score"] = result["similarity_score"] * weights[emb_type]
                    combined_results.append(result)

            combined_results.sort(key=lambda x: x["weighted_score"], reverse=True)

        # Deduplicate by chunk, keeping the best raw-similarity version.
        seen_chunks = {}
        for result in combined_results:
            chunk_key = f"{result['source_folder']}_chunk_{result['chunk_number']}"
            if chunk_key not in seen_chunks or result["similarity_score"] > seen_chunks[chunk_key]["similarity_score"]:
                seen_chunks[chunk_key] = result

        final_results = list(seen_chunks.values())
        final_results.sort(key=lambda x: x.get("weighted_score", x["similarity_score"]), reverse=True)

        # Add final ranking after dedup + sort.
        for i, result in enumerate(final_results):
            result["final_rank"] = i + 1

        return json.dumps({
            "query": query,
            "combine_strategy": combine_strategy,
            "total_results": len(final_results),
            "embedding_types_searched": embedding_types,
            "results_per_type": {emb_type: len(results) for emb_type, results in all_results.items()},
            "source_filter": source_filter,
            "results": final_results[:num_results_per_type * 2],  # Limit final output
            "success": True
        }, indent=2)

    except Exception as e:
        return json.dumps({
            "error": f"Multi-search failed: {str(e)}",
            "query": query,
            "results": [],
            "success": False
        })
993
+
994
def analyze_embedding_coverage(source_filter: Optional[str] = None) -> str:
    """
    Analyze the distribution and coverage of different embedding types in the knowledge base.

    Args:
        source_filter: Limit analysis to specific source folder (optional)

    Returns:
        JSON string with embedding type statistics and coverage analysis
    """
    try:
        # Fetch metadata for every stored embedding (optionally one source).
        if source_filter:
            all_docs = collection.get(where={"source_folder": source_filter})
        else:
            all_docs = collection.get()

        type_counts = {}
        enhanced_counts = {"enhanced": 0, "original": 0}
        source_breakdown = {}
        qna_stats = {"with_qna": 0, "without_qna": 0}

        for metadata in all_docs['metadatas']:
            emb_type = metadata.get('embedding_type', 'unknown')
            type_counts[emb_type] = type_counts.get(emb_type, 0) + 1

            # Enhanced vs original chunks.
            if metadata.get('enhanced', False):
                enhanced_counts["enhanced"] += 1
            else:
                enhanced_counts["original"] += 1

            # Per-source distribution of embedding types.
            source = metadata.get('source_folder', 'unknown')
            source_breakdown.setdefault(source, {})
            source_breakdown[source][emb_type] = source_breakdown[source].get(emb_type, 0) + 1

            # QnA enrichment statistics.
            if metadata.get('qna_count', 0) > 0:
                qna_stats["with_qna"] += 1
            else:
                qna_stats["without_qna"] += 1

        total_embeddings = len(all_docs['metadatas'])

        # FIX: an empty collection previously raised ZeroDivisionError,
        # surfacing as "Analysis failed: division by zero".
        if total_embeddings:
            coverage_percentage = {
                emb_type: round((count / total_embeddings) * 100, 2)
                for emb_type, count in type_counts.items()
            }
        else:
            coverage_percentage = {}

        return json.dumps({
            "total_embeddings": total_embeddings,
            "source_filter": source_filter,
            "embedding_type_distribution": type_counts,
            "enhancement_status": enhanced_counts,
            "qna_coverage": qna_stats,
            "source_breakdown": source_breakdown,
            "coverage_percentage": coverage_percentage,
            "success": True
        }, indent=2)

    except Exception as e:
        return json.dumps({
            "error": f"Analysis failed: {str(e)}",
            "analysis": {},
            "success": False
        })
1066
+
1067
def find_related_questions(
    topic: str,
    num_questions: int = 5,
    source_filter: Optional[str] = None
) -> str:
    """Discover questions related to a topic via the questions-only embeddings.

    Useful for exploration: shows what questions the knowledge base has
    coverage for around a given concept.

    Args:
        topic: Topic or concept to find questions about (required).
        num_questions: Number of related questions to return (1-20, default: 5).
        source_filter: Limit to a specific source folder (optional).

    Returns:
        JSON string with related questions and their context.
    """
    if not topic or not topic.strip():
        return json.dumps({"error": "Topic cannot be empty", "questions": [], "success": False})

    num_questions = max(1, min(num_questions, 20))

    try:
        # Drive the lookup through the questions-only embedding search.
        raw = search_by_embedding_type(topic, "questions", num_questions, source_filter)
        parsed = json.loads(raw)

        if not parsed.get("success", False):
            return json.dumps({
                "error": "Failed to search questions",
                "topic": topic,
                "questions": [],
                "success": False
            })

        # Each stored document packs several questions as "Q1 | Q2 | Q3";
        # unpack them and attach the chunk's retrieval context.
        collected = []
        for match in parsed["results"]:
            for text in (piece.strip() for piece in match["content"].split("|")):
                if not text:  # skip empty fragments
                    continue
                collected.append({
                    "question": text,
                    "relevance_score": match["similarity_score"],
                    "source": f"{match['source_folder']} (chunk {match['chunk_number']})",
                    "chunk_file": match["chunk_file"],
                    "qna_count": match.get("qna_count", 0)
                })

        # Best-first, capped at the requested count, then ranked.
        collected.sort(key=lambda item: item["relevance_score"], reverse=True)
        top = collected[:num_questions]
        for position, entry in enumerate(top, start=1):
            entry["rank"] = position

        return json.dumps({
            "topic": topic,
            "total_questions_found": len(top),
            "source_filter": source_filter,
            "questions": top,
            "success": True
        }, indent=2)

    except Exception as e:
        return json.dumps({
            "error": f"Question search failed: {str(e)}",
            "topic": topic,
            "questions": [],
            "success": False
        })
1146
+
1147
 
1148
  with gr.Blocks() as demo:
1149
  gr.Markdown(