huytrao123 committed on
Commit
c3bf6d5
·
verified ·
1 Parent(s): 6fa8f08

Update src/Indexingstep/indexing_pipeline.py

Browse files
Files changed (1) hide show
  1. src/Indexingstep/indexing_pipeline.py +110 -110
src/Indexingstep/indexing_pipeline.py CHANGED
@@ -1,110 +1,110 @@
1
- import os
2
- import sys
3
- from typing import List, Dict, Any
4
- from datetime import datetime
5
-
6
- # Add parent directory to path
7
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
8
-
9
- from langchain_google_genai import GoogleGenerativeAIEmbeddings
10
- from langchain_chroma import Chroma
11
- from langchain.schema import Document
12
- from langchain.text_splitter import RecursiveCharacterTextSplitter
13
-
14
- def create_user_vector_database(user_id: int, diary_entries: List[Dict[str, Any]]) -> bool:
15
- """
16
- Create vector database for a specific user from their diary entries.
17
-
18
- Args:
19
- user_id: User ID
20
- diary_entries: List of diary entries from database
21
-
22
- Returns:
23
- True if successful, False otherwise
24
- """
25
- try:
26
- # Setup paths
27
- base_vector_path = os.path.dirname(os.path.abspath(__file__))
28
- vector_db_path = os.path.join(base_vector_path, f"user_{user_id}_vector_db")
29
- collection_name = f"user_{user_id}_diary_entries"
30
-
31
- # Create directory
32
- os.makedirs(vector_db_path, exist_ok=True)
33
-
34
- # Initialize embeddings
35
- google_api_key = os.getenv("GOOGLE_API_KEY")
36
- if not google_api_key:
37
- raise ValueError("Google API key not found")
38
-
39
- embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
40
-
41
- # Process diary entries into documents
42
- documents = []
43
- text_splitter = RecursiveCharacterTextSplitter(
44
- chunk_size=1000,
45
- chunk_overlap=200,
46
- length_function=len,
47
- )
48
-
49
- for entry in diary_entries:
50
- # Extract content
51
- content = entry.get('content', '')
52
- if not content:
53
- continue
54
-
55
- # Extract title and content
56
- lines = content.split('\n')
57
- title = "Untitled"
58
- actual_content = content
59
-
60
- for line in lines:
61
- if line.startswith('Title: '):
62
- title = line.replace('Title: ', '').strip()
63
- elif line.startswith('Content: '):
64
- actual_content = line.replace('Content: ', '').strip()
65
- break
66
-
67
- # Create metadata
68
- metadata = {
69
- 'user_id': user_id,
70
- 'entry_id': entry.get('id'),
71
- 'date': entry.get('date', ''),
72
- 'title': title,
73
- 'tags': entry.get('tags', ''),
74
- 'tags_list': [tag.strip() for tag in entry.get('tags', '').split(',') if tag.strip()],
75
- 'source': f"diary_entry_{entry.get('id')}"
76
- }
77
-
78
- # Split content if too long
79
- if len(actual_content) > 1000:
80
- chunks = text_splitter.split_text(actual_content)
81
- for i, chunk in enumerate(chunks):
82
- chunk_metadata = metadata.copy()
83
- chunk_metadata['chunk_id'] = i
84
- documents.append(Document(page_content=chunk, metadata=chunk_metadata))
85
- else:
86
- documents.append(Document(page_content=actual_content, metadata=metadata))
87
-
88
- if not documents:
89
- print(f"No documents to index for user {user_id}")
90
- return False
91
-
92
- # Create vector store
93
- vector_store = Chroma(
94
- persist_directory=vector_db_path,
95
- embedding_function=embeddings,
96
- collection_name=collection_name
97
- )
98
-
99
- # Add documents to vector store
100
- vector_store.add_documents(documents)
101
-
102
- # Persist the database
103
- vector_store.persist()
104
-
105
- print(f"Successfully created vector database for user {user_id} with {len(documents)} documents")
106
- return True
107
-
108
- except Exception as e:
109
- print(f"Error creating vector database for user {user_id}: {e}")
110
- return False
 
1
+ import os
2
+ import sys
3
+ from typing import List, Dict, Any
4
+ from datetime import datetime
5
+
6
+ # Add parent directory to path
7
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
8
+
9
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
10
+ from langchain.vectorstores import Chroma
11
+ from langchain.schema import Document
12
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
13
+
14
def create_user_vector_database(user_id: int, diary_entries: List[Dict[str, Any]]) -> bool:
    """
    Create vector database for a specific user from their diary entries.

    Each entry's ``content`` is parsed for optional ``Title: `` / ``Content: ``
    markers, split into chunks when it exceeds the chunk size, embedded with
    Google Generative AI embeddings and stored in a per-user Chroma collection
    located next to this module (``user_<id>_vector_db``).

    Args:
        user_id: User ID
        diary_entries: List of diary entries from database. The keys read are
            ``content``, ``id``, ``date`` and ``tags`` (comma-separated string).

    Returns:
        True if successful, False otherwise (missing API key, nothing to
        index, or any error raised while embedding/storing).
    """
    # Single source of truth for both the "needs splitting" test and the splitter.
    max_chunk_chars = 1000

    try:
        # Fail fast, before touching the filesystem, when the key is missing.
        if not os.getenv("GOOGLE_API_KEY"):
            raise ValueError("Google API key not found")

        # Setup paths
        base_vector_path = os.path.dirname(os.path.abspath(__file__))
        vector_db_path = os.path.join(base_vector_path, f"user_{user_id}_vector_db")
        collection_name = f"user_{user_id}_diary_entries"

        # Initialize embeddings
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_chunk_chars,
            chunk_overlap=200,
            length_function=len,
        )

        # Process diary entries into documents
        documents = []
        for entry in diary_entries:
            content = entry.get('content', '')
            if not content:
                continue

            title, actual_content = _split_title_and_content(content)
            if not actual_content:
                # Nothing to embed (e.g. a bare "Content: " line).
                continue

            entry_id = entry.get('id')
            # ``tags`` may be present but None; normalise so .split() is safe.
            tags = entry.get('tags') or ''

            metadata = {
                'user_id': user_id,
                'entry_id': entry_id,
                'date': entry.get('date', ''),
                'title': title,
                'tags': tags,
                # NOTE(review): some Chroma versions reject list-valued
                # metadata -- confirm against the installed version.
                'tags_list': [tag.strip() for tag in tags.split(',') if tag.strip()],
                'source': f"diary_entry_{entry_id}",
            }

            # Split content if too long; short entries stay as one document
            # without a chunk_id, as before.
            if len(actual_content) > max_chunk_chars:
                for i, chunk in enumerate(text_splitter.split_text(actual_content)):
                    chunk_metadata = metadata.copy()
                    chunk_metadata['chunk_id'] = i
                    documents.append(Document(page_content=chunk, metadata=chunk_metadata))
            else:
                documents.append(Document(page_content=actual_content, metadata=metadata))

        if not documents:
            print(f"No documents to index for user {user_id}")
            return False

        # Create the directory only once there is something to store, so a
        # failed or empty run does not leave an empty database folder behind.
        os.makedirs(vector_db_path, exist_ok=True)

        # Create vector store
        vector_store = Chroma(
            persist_directory=vector_db_path,
            embedding_function=embeddings,
            collection_name=collection_name,
        )

        # Add documents to vector store
        vector_store.add_documents(documents)

        # Persist the database. Newer Chroma integrations write to disk
        # automatically and may not expose persist(); call it only if present.
        persist = getattr(vector_store, "persist", None)
        if callable(persist):
            persist()

        print(f"Successfully created vector database for user {user_id} with {len(documents)} documents")
        return True

    except Exception as e:
        # Top-level boundary: report and signal failure to the caller.
        print(f"Error creating vector database for user {user_id}: {e}")
        return False


def _split_title_and_content(content: str) -> tuple:
    """Return ``(title, body)`` parsed from a raw diary ``content`` string.

    A line starting with ``Title: `` sets the title (default ``"Untitled"``).
    The first line starting with ``Content: `` begins the body, which runs to
    the end of the string so multi-line entries are kept whole. When no
    ``Content: `` marker exists, the entire input is returned as the body.
    """
    title_prefix = 'Title: '
    content_prefix = 'Content: '

    title = "Untitled"
    lines = content.split('\n')
    for index, line in enumerate(lines):
        if line.startswith(title_prefix):
            # Strip only the leading marker, not later occurrences in the line.
            title = line[len(title_prefix):].strip()
        elif line.startswith(content_prefix):
            body_lines = [line[len(content_prefix):]] + lines[index + 1:]
            return title, '\n'.join(body_lines).strip()
    return title, content