vikramvasudevan committed on
Commit
6193938
·
verified ·
1 Parent(s): 96fa70b

Upload folder using huggingface_hub

Browse files
data/dropbox/scriptures/sri_stavam/video_metadata.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "scripture": "sri_stavam",
4
+ "global_index": 2,
5
+ "video_url": "https://www.youtube.com/watch?v=u2MUPrE7Zx8",
6
+ "type": "upanyasam"
7
+ },
8
+ {
9
+ "scripture": "sri_stavam",
10
+ "global_index": 3,
11
+ "video_url": "https://www.youtube.com/watch?v=s-aIwjzmEbs",
12
+ "type": "upanyasam"
13
+ },
14
+ {
15
+ "scripture": "sri_stavam",
16
+ "global_index": 4,
17
+ "video_url": "https://www.youtube.com/watch?v=WjOfN-GiZXE",
18
+ "type": "upanyasam"
19
+ },
20
+ {
21
+ "scripture": "sri_stavam",
22
+ "global_index": 5,
23
+ "video_url": "https://www.youtube.com/watch?v=ivPWtir9JGY",
24
+ "type": "upanyasam"
25
+ },
26
+ {
27
+ "scripture": "sri_stavam",
28
+ "global_index": 6,
29
+ "video_url": "https://www.youtube.com/watch?v=_QBqGwpjOO0",
30
+ "type": "upanyasam"
31
+ },
32
+ {
33
+ "scripture": "sri_stavam",
34
+ "global_index": 7,
35
+ "video_url": "https://www.youtube.com/watch?v=5K36X0FhY8Y",
36
+ "type": "upanyasam"
37
+ },
38
+ {
39
+ "scripture": "sri_stavam",
40
+ "global_index": 8,
41
+ "video_url": "https://www.youtube.com/watch?v=FLhSyUkMPGw",
42
+ "type": "upanyasam"
43
+ },
44
+ {
45
+ "scripture": "sri_stavam",
46
+ "global_index": 9,
47
+ "video_url": "https://www.youtube.com/watch?v=q-Vf7T1CPME",
48
+ "type": "upanyasam"
49
+ },
50
+ {
51
+ "scripture": "sri_stavam",
52
+ "global_index": 10,
53
+ "video_url": "https://www.youtube.com/watch?v=TEpI4koRvRI",
54
+ "type": "upanyasam"
55
+ },
56
+ {
57
+ "scripture": "sri_stavam",
58
+ "global_index": 11,
59
+ "video_url": "https://www.youtube.com/watch?v=9ievpAujqnw",
60
+ "type": "upanyasam"
61
+ }
62
+ ]
db.py CHANGED
@@ -38,6 +38,10 @@ class SanatanDatabase:
38
  metadatas=metadatas,
39
  )
40
 
 
 
 
 
41
  def fetch_random_data(
42
  self,
43
  collection_name: str,
@@ -77,7 +81,7 @@ class SanatanDatabase:
77
  metadatas=[metas[i] for i in indices],
78
  )
79
 
80
- def fetch_first_match(
81
  self, collection_name: str, metadata_where_clause: MetadataWhereClause = None
82
  ):
83
  """This version is created to support the browse module with fallback regex matching"""
@@ -87,6 +91,7 @@ class SanatanDatabase:
87
  s = unicodedata.normalize("NFD", s)
88
  s = "".join(ch for ch in s if not unicodedata.combining(ch))
89
  return s
 
90
  logger.info(
91
  "getting first matching verses from [%s] | metadata_where_clause = %s",
92
  collection_name,
@@ -122,7 +127,8 @@ class SanatanDatabase:
122
 
123
  # find filters with $eq string type
124
  regex_filters = [
125
- f for f in metadata_where_clause.filters
 
126
  if f.metadata_search_operator == "$eq" and isinstance(f.metadata_value, str)
127
  ]
128
 
@@ -137,7 +143,7 @@ class SanatanDatabase:
137
  ok = True
138
  for f in regex_filters:
139
  field_val = str(meta.get(f.metadata_field, ""))
140
-
141
  # Normalize both the stored field and the search value
142
  norm_val = normalize_for_match(field_val)
143
  norm_query = normalize_for_match(f.metadata_value)
@@ -149,7 +155,6 @@ class SanatanDatabase:
149
  if ok:
150
  matched_indices.append(i)
151
 
152
-
153
  if not matched_indices:
154
  logger.warning("Regex fallback also found no matches.")
155
  return chromadb.GetResult(ids=[], documents=[], metadatas=[])
@@ -715,4 +720,4 @@ class SanatanDatabase:
715
  def delete_taniyans_in_divya_prabandham(self):
716
  nalayiram_helper.delete_taniyan(
717
  self.chroma_client.get_collection("divya_prabandham")
718
- )
 
38
  metadatas=metadatas,
39
  )
40
 
41
+ def get(self, collection_name: str, where, n_results=5):
42
+ collection = self.chroma_client.get_or_create_collection(name=collection_name)
43
+ return collection.get(where=where, limit=n_results)
44
+
45
  def fetch_random_data(
46
  self,
47
  collection_name: str,
 
81
  metadatas=[metas[i] for i in indices],
82
  )
83
 
84
+ def fetch_first_match(
85
  self, collection_name: str, metadata_where_clause: MetadataWhereClause = None
86
  ):
87
  """This version is created to support the browse module with fallback regex matching"""
 
91
  s = unicodedata.normalize("NFD", s)
92
  s = "".join(ch for ch in s if not unicodedata.combining(ch))
93
  return s
94
+
95
  logger.info(
96
  "getting first matching verses from [%s] | metadata_where_clause = %s",
97
  collection_name,
 
127
 
128
  # find filters with $eq string type
129
  regex_filters = [
130
+ f
131
+ for f in metadata_where_clause.filters
132
  if f.metadata_search_operator == "$eq" and isinstance(f.metadata_value, str)
133
  ]
134
 
 
143
  ok = True
144
  for f in regex_filters:
145
  field_val = str(meta.get(f.metadata_field, ""))
146
+
147
  # Normalize both the stored field and the search value
148
  norm_val = normalize_for_match(field_val)
149
  norm_query = normalize_for_match(f.metadata_value)
 
155
  if ok:
156
  matched_indices.append(i)
157
 
 
158
  if not matched_indices:
159
  logger.warning("Regex fallback also found no matches.")
160
  return chromadb.GetResult(ids=[], documents=[], metadatas=[])
 
720
  def delete_taniyans_in_divya_prabandham(self):
721
  nalayiram_helper.delete_taniyan(
722
  self.chroma_client.get_collection("divya_prabandham")
723
+ )
tests/test_yt_search.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+
4
+ from db import SanatanDatabase
5
+ from metadata import MetadataFilter, MetadataWhereClause
6
+
7
+ if __name__ == "__main__":
8
+ logging.basicConfig()
9
+ collection_name = "yt_metadata"
10
+ database = SanatanDatabase()
11
+ print("count = ", database.count(collection_name))
12
+ response = database.fetch_first_match(
13
+ collection_name=collection_name,
14
+ metadata_where_clause=MetadataWhereClause(filters=[
15
+ # MetadataFilter(metadata_field="channel_id", metadata_search_operator="$eq", metadata_value="UCqa48rNanVRKmG4qxl-YmEQ"),
16
+ MetadataFilter(metadata_field="channel_title", metadata_search_operator="$eq", metadata_value="sookthi"),
17
+ MetadataFilter(metadata_field="video_title", metadata_search_operator="$eq", metadata_value="krishna mangalam")
18
+
19
+ ])
20
+ )
21
+ print(json.dumps(response, indent=1, ensure_ascii=False))