deveshm8 commited on
Commit
b9737c3
·
1 Parent(s): 2f5c3de

create embeddings of ppt and store metadata of files in tables

Browse files
migrations/versions/{c649dcba2de3_create_tables.py → 3292a4761b48_files_and_vector_records.py} RENAMED
@@ -1,8 +1,8 @@
1
- """create_tables
2
 
3
- Revision ID: c649dcba2de3
4
  Revises: 8652e8501339
5
- Create Date: 2024-10-25 14:28:36.463641
6
 
7
  """
8
  from typing import Sequence, Union
@@ -12,7 +12,7 @@ import sqlalchemy as sa
12
 
13
 
14
  # revision identifiers, used by Alembic.
15
- revision: str = 'c649dcba2de3'
16
  down_revision: Union[str, None] = '8652e8501339'
17
  branch_labels: Union[str, Sequence[str], None] = None
18
  depends_on: Union[str, Sequence[str], None] = None
@@ -20,11 +20,29 @@ depends_on: Union[str, Sequence[str], None] = None
20
 
21
  def upgrade() -> None:
22
  # ### commands auto generated by Alembic - please adjust! ###
23
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # ### end Alembic commands ###
25
 
26
 
27
  def downgrade() -> None:
28
  # ### commands auto generated by Alembic - please adjust! ###
29
- pass
 
30
  # ### end Alembic commands ###
 
1
+ """files_and_vector_records
2
 
3
+ Revision ID: 3292a4761b48
4
  Revises: 8652e8501339
5
+ Create Date: 2024-10-28 13:03:53.442309
6
 
7
  """
8
  from typing import Sequence, Union
 
12
 
13
 
14
  # revision identifiers, used by Alembic.
15
+ revision: str = '3292a4761b48'
16
  down_revision: Union[str, None] = '8652e8501339'
17
  branch_labels: Union[str, Sequence[str], None] = None
18
  depends_on: Union[str, Sequence[str], None] = None
 
20
 
21
  def upgrade() -> None:
22
  # ### commands auto generated by Alembic - please adjust! ###
23
+ op.create_table('files',
24
+ sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
25
+ sa.Column('file_name', sa.String(), nullable=False),
26
+ sa.Column('uploaded_at', sa.DateTime(), nullable=False),
27
+ sa.Column('processed_at', sa.DateTime(), nullable=True),
28
+ sa.Column('slack_file_id', sa.String(), nullable=True),
29
+ sa.Column('user_id', sa.Integer(), nullable=False),
30
+ sa.Column('mime_type', sa.String(), nullable=False),
31
+ sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
32
+ sa.PrimaryKeyConstraint('id')
33
+ )
34
+ op.create_table('vector_records',
35
+ sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
36
+ sa.Column('file_id', sa.Integer(), nullable=False),
37
+ sa.Column('record_id', sa.Integer(), nullable=False),
38
+ sa.ForeignKeyConstraint(['file_id'], ['files.id'], ),
39
+ sa.PrimaryKeyConstraint('id')
40
+ )
41
  # ### end Alembic commands ###
42
 
43
 
44
  def downgrade() -> None:
45
  # ### commands auto generated by Alembic - please adjust! ###
46
+ op.drop_table('vector_records')
47
+ op.drop_table('files')
48
  # ### end Alembic commands ###
migrations/versions/37cddaae80e7_create_tables.py DELETED
@@ -1,30 +0,0 @@
1
- """create_tables
2
-
3
- Revision ID: 37cddaae80e7
4
- Revises: c649dcba2de3
5
- Create Date: 2024-10-28 11:29:18.878826
6
-
7
- """
8
- from typing import Sequence, Union
9
-
10
- from alembic import op
11
- import sqlalchemy as sa
12
-
13
-
14
- # revision identifiers, used by Alembic.
15
- revision: str = '37cddaae80e7'
16
- down_revision: Union[str, None] = 'c649dcba2de3'
17
- branch_labels: Union[str, Sequence[str], None] = None
18
- depends_on: Union[str, Sequence[str], None] = None
19
-
20
-
21
- def upgrade() -> None:
22
- # ### commands auto generated by Alembic - please adjust! ###
23
- pass
24
- # ### end Alembic commands ###
25
-
26
-
27
- def downgrade() -> None:
28
- # ### commands auto generated by Alembic - please adjust! ###
29
- pass
30
- # ### end Alembic commands ###
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/controllers/_bot_controller.py CHANGED
@@ -7,7 +7,7 @@ from src.utils import SlackClient
7
  from src.services import SlackBotService, SlackTeamService, UserService
8
  from src.models import ContentDeliveryFrequency
9
  from src.utils import logger
10
-
11
 
12
  class BotController:
13
  def __init__(self):
@@ -23,23 +23,68 @@ class BotController:
23
  try:
24
  if "challenge" in event:
25
  return {"challenge": event["challenge"]}
 
26
  if "event" in event:
27
  actual_event = event["event"]
28
  logger.info(f"Actual event: {actual_event}")
 
29
  if "file" in actual_event:
30
  file = actual_event["file"]
31
  file_id = file["id"]
32
  logger.info(f"file id is {file_id}")
 
33
  async with SlackClient() as slack_client:
 
34
  file_info = await slack_client.get_file_info(file_id)
35
  logger.info(f"File info: {file_info}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  users = await slack_client.get_users_list()
37
  logger.info(f"Users: {users}")
38
  else:
39
  logger.info("file not shared")
40
  else:
41
  logger.info("event not found")
 
42
  return {"message": "Event received successfully"}
 
43
  except HTTPException as e:
44
  logger.warning(e)
45
  raise e
 
7
  from src.services import SlackBotService, SlackTeamService, UserService
8
  from src.models import ContentDeliveryFrequency
9
  from src.utils import logger
10
+ from src.services import FileService
11
 
12
  class BotController:
13
  def __init__(self):
 
23
  try:
24
  if "challenge" in event:
25
  return {"challenge": event["challenge"]}
26
+
27
  if "event" in event:
28
  actual_event = event["event"]
29
  logger.info(f"Actual event: {actual_event}")
30
+
31
  if "file" in actual_event:
32
  file = actual_event["file"]
33
  file_id = file["id"]
34
  logger.info(f"file id is {file_id}")
35
+
36
  async with SlackClient() as slack_client:
37
+ # Get file info
38
  file_info = await slack_client.get_file_info(file_id)
39
  logger.info(f"File info: {file_info}")
40
+
41
+ # Get file URL
42
+ file_url = file_info.get("url_private")
43
+ file_name = file_info.get("name", "downloaded_file")
44
+
45
+ if file_url:
46
+ # Download the file content
47
+ headers = {"Authorization": f"Bearer {slack_client.token}"}
48
+ async with aiohttp.ClientSession() as session:
49
+ async with session.get(file_url, headers=headers) as response:
50
+ if response.status == 200:
51
+ content = await response.read()
52
+ # Create UploadFile object from the content
53
+ upload_file = UploadFile(
54
+ filename=file_name,
55
+ file=BytesIO(content)
56
+ )
57
+
58
+ # Process the file
59
+ async with self.user_repository as user_repo:
60
+ user = await user_repo.get_user_by_slack_id(user_slack_id)
61
+ if not user:
62
+ logger.error(f"User not found for slack_id: {user_slack_id}")
63
+ return {"message": "User not found"}
64
+
65
+ async with self.file_service as service:
66
+ result = await service.parse_document(
67
+ file=upload_file,
68
+ user_id=user.id,
69
+ slack_file_id=slack_file_id
70
+ )
71
+ logger.info(f"File processing result: {result}")
72
+ else:
73
+ logger.error(f"Failed to download file. Status: {response.status}")
74
+ raise HTTPException(
75
+ status_code=response.status,
76
+ detail="Failed to download file from Slack"
77
+ )
78
+
79
  users = await slack_client.get_users_list()
80
  logger.info(f"Users: {users}")
81
  else:
82
  logger.info("file not shared")
83
  else:
84
  logger.info("event not found")
85
+
86
  return {"message": "Event received successfully"}
87
+
88
  except HTTPException as e:
89
  logger.warning(e)
90
  raise e
src/models/__init__.py CHANGED
@@ -1,7 +1,9 @@
1
  from ._base import Base
2
  from ._user import User, ContentDeliveryFrequency
3
  from ._slack_team import SlackTeam
 
 
4
 
5
- __all__ = ["Base", "User", "ContentDeliveryFrequency", "SlackTeam"]
6
  __version__ = "0.1.0"
7
  __author__ = "Kanha Upadhyay"
 
1
  from ._base import Base
2
  from ._user import User, ContentDeliveryFrequency
3
  from ._slack_team import SlackTeam
4
+ from ._files import File
5
+ from ._vector_records import VectorRecord
6
 
7
+ __all__ = ["Base", "User", "ContentDeliveryFrequency", "SlackTeam", "File", "VectorRecord"]
8
  __version__ = "0.1.0"
9
  __author__ = "Kanha Upadhyay"
src/repositories/__init__.py CHANGED
@@ -2,7 +2,9 @@ from ._base_repository import BaseRepository
2
  from ._config import DatabaseConfig
3
  from ._user_repository import UserRepository
4
  from ._slack_team_repository import SlackTeamRepository
 
 
5
 
6
- __all__ = ["DatabaseConfig", "BaseRepository", "UserRepository", "SlackTeamRepository"]
7
  __version__ = "0.1.0"
8
  __author__ = "Kanha Upadhyay"
 
2
  from ._config import DatabaseConfig
3
  from ._user_repository import UserRepository
4
  from ._slack_team_repository import SlackTeamRepository
5
+ from ._file_repository import FileRepository
6
+ from ._vector_record_repository import VectorRecordRepository
7
 
8
+ __all__ = ["DatabaseConfig", "BaseRepository", "UserRepository", "SlackTeamRepository", "FileRepository", "VectorRecordRepository"]
9
  __version__ = "0.1.0"
10
  __author__ = "Kanha Upadhyay"
src/repositories/_file_repository.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import select
2
+ from datetime import datetime
3
+ from src.models import File
4
+ from ._base_repository import BaseRepository
5
+
6
+ class FileRepository(BaseRepository):
7
+ def __init__(self):
8
+ super().__init__(model=File)
9
+
10
+ async def create_file(self, file_name: str, user_id: int, mime_type: str, slack_file_id: str = None) -> File:
11
+ """Create a new file record"""
12
+ file = File(
13
+ file_name=file_name,
14
+ user_id=user_id,
15
+ mime_type=mime_type,
16
+ slack_file_id=slack_file_id,
17
+ uploaded_at=datetime.utcnow()
18
+ )
19
+ return await self.add(file)
20
+
21
+ async def mark_as_processed(self, file_id: int) -> None:
22
+ """Mark file as processed"""
23
+ file = await self.get_by_id(file_id)
24
+ if file:
25
+ file.processed_at = datetime.utcnow()
26
+ await self.update(file)
27
+
28
+ async def get_by_slack_file_id(self, slack_file_id: str) -> File:
29
+ """Get file by Slack file ID"""
30
+ query = select(File).where(File.slack_file_id == slack_file_id)
31
+ result = await self.execute(query)
32
+ return result.scalar_one_or_none()
src/repositories/_vector_record_repository.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.models import VectorRecord
2
+ from ._base_repository import BaseRepository
3
+ from typing import List
4
+
5
+ class VectorRecordRepository(BaseRepository):
6
+ def __init__(self):
7
+ super().__init__(model=VectorRecord)
8
+
9
+ async def create_records(self, file_id: int, record_ids: List[int]) -> List[VectorRecord]:
10
+ """Create multiple vector records for a file"""
11
+ records = [
12
+ VectorRecord(file_id=file_id, record_id=record_id)
13
+ for record_id in record_ids
14
+ ]
15
+ return await self.add_all(records)
16
+
17
+ async def get_by_file_id(self, file_id: int) -> List[VectorRecord]:
18
+ """Get all vector records for a file"""
19
+ query = select(VectorRecord).where(VectorRecord.file_id == file_id)
20
+ result = await self.execute(query)
21
+ return result.scalars().all()
src/services/__init__.py CHANGED
@@ -4,6 +4,6 @@ from ._slack_bot_service import SlackBotService
4
  from ._slack_team_service import SlackTeamService
5
  from ._file_service import FileService
6
 
7
- __all__ = ["ContentDeliveryService", "UserService", "SlackBotService", "SlackTeamService","FileService"]
8
  __version__ = "0.1.0"
9
  __author__ = "Ashok Bhati"
 
4
  from ._slack_team_service import SlackTeamService
5
  from ._file_service import FileService
6
 
7
+ __all__ = ["ContentDeliveryService", "UserService", "SlackBotService", "SlackTeamService", "FileService"]
8
  __version__ = "0.1.0"
9
  __author__ = "Ashok Bhati"
src/services/_file_service.py CHANGED
@@ -36,18 +36,75 @@ class FileService:
36
  shutil.copyfileobj(file.file, buffer)
37
  return file_path
38
 
39
- async def parse_document(self, file: UploadFile) -> Dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  try:
41
  file_path = await self.save_uploaded_file(file)
42
 
 
 
 
 
 
 
 
 
 
43
  parsed_content = await parse_file_content(str(file_path))
 
 
 
 
44
 
45
  documents = create_documents(parsed_content, file.filename)
46
 
47
- await store_documents(documents)
 
 
 
 
 
 
 
 
48
 
49
  return {
50
  "message": "Document's embeddings created successfully",
 
51
  }
52
  except Exception as e:
53
  logger.error(f"Error in parse_document: {str(e)}")
 
36
  shutil.copyfileobj(file.file, buffer)
37
  return file_path
38
 
39
+ async def filter_ppt_content(self, parsed_content: Tuple[Tuple[int, str], ...]) -> Tuple[Tuple[int, str], ...]:
40
+ """
41
+ Filter PowerPoint content based on specific target phrases.
42
+
43
+ Args:
44
+ parsed_content: Tuple of (page_number, content) pairs
45
+
46
+ Returns:
47
+ Filtered tuple of (page_number, content) pairs
48
+ """
49
+ target_phrases = [
50
+ "My Strategy for Managing Myself",
51
+ "My Strategy For Active Listening",
52
+ "My Strategy for Managing Others",
53
+ "What It Concretely Looks Like",
54
+ "My People",
55
+ "My Philosophy",
56
+ "My Preferences",
57
+ "My Priorities for Q1 2025"
58
+ ]
59
+
60
+ filtered_content = []
61
+
62
+ for page_num, content in parsed_content:
63
+ content_lower = content.lower()
64
+ for phrase in target_phrases:
65
+ if phrase.lower() in content_lower:
66
+ filtered_content.append((page_num, content))
67
+ break # Move to next page once we find a matching phrase
68
+
69
+ logger.info(f"Total pages before filtering: {len(parsed_content)}")
70
+ logger.info(f"Total pages after filtering: {len(filtered_content)}")
71
+
72
+ return tuple(filtered_content)
73
+
74
+ async def parse_document(self, file: UploadFile, user_id: int, slack_file_id: str = None) -> Dict:
75
  try:
76
  file_path = await self.save_uploaded_file(file)
77
 
78
+ # Create file record in database
79
+ mime_type = file.content_type or f"application/{file.filename.split('.')[-1]}"
80
+ db_file = await self.file_repository.create_file(
81
+ file_name=file.filename,
82
+ user_id=user_id,
83
+ mime_type=mime_type,
84
+ slack_file_id=slack_file_id
85
+ )
86
+
87
  parsed_content = await parse_file_content(str(file_path))
88
+
89
+ file_extension = file.filename.lower().split('.')[-1]
90
+ if file_extension in ['ppt', 'pptx']:
91
+ parsed_content = await self.filter_ppt_content(parsed_content)
92
 
93
  documents = create_documents(parsed_content, file.filename)
94
 
95
+ # Store documents in vector store and get record IDs
96
+ record_ids = await store_documents(documents)
97
+
98
+ # Store vector record mappings in database
99
+ if record_ids:
100
+ await self.vector_repository.create_records(db_file.id, record_ids)
101
+
102
+ # Mark file as processed
103
+ await self.file_repository.mark_as_processed(db_file.id)
104
 
105
  return {
106
  "message": "Document's embeddings created successfully",
107
+ "file_id": db_file.id
108
  }
109
  except Exception as e:
110
  logger.error(f"Error in parse_document: {str(e)}")