Gaykar committed on
Commit
f3b2b2d
·
1 Parent(s): 621eb6f
Files changed (34) hide show
  1. app/ai_agents/__init__.py +0 -0
  2. agents.py → app/ai_agents/agents.py +2 -2
  3. app/core/__init__.py +0 -0
  4. config.py → app/core/config.py +9 -17
  5. app/database/__init__.py +0 -0
  6. database.py → app/database/connection.py +7 -3
  7. graph.py → app/graph.py +2 -2
  8. graph_trial.py → app/graph_trial.py +1 -1
  9. app.py → app/main.py +2 -3
  10. nodes.py → app/nodes/nodes.py +5 -5
  11. schemas.py → app/schemas/schemas.py +0 -0
  12. state.py → app/state/state.py +0 -0
  13. utils.py → app/utils/utils.py +1 -1
  14. {ComplaintData → app/vectordatabase/ComplaintData}/buildingsPlanning.json +0 -0
  15. {ComplaintData → app/vectordatabase/ComplaintData}/buildingsPlanning_langchain_formatted.json +0 -0
  16. {ComplaintData → app/vectordatabase/ComplaintData}/complaint_matching_data.json +0 -0
  17. {ComplaintData → app/vectordatabase/ComplaintData}/electricity.json +0 -0
  18. {ComplaintData → app/vectordatabase/ComplaintData}/electricity_langchain_formatted.json +0 -0
  19. {ComplaintData → app/vectordatabase/ComplaintData}/parkRecreation.json +0 -0
  20. {ComplaintData → app/vectordatabase/ComplaintData}/parkRecreation_langchain_formatted.json +0 -0
  21. {ComplaintData → app/vectordatabase/ComplaintData}/publicHealth.json +0 -0
  22. {ComplaintData → app/vectordatabase/ComplaintData}/publicHealth_langchain_formatted.json +0 -0
  23. {ComplaintData → app/vectordatabase/ComplaintData}/publicSafety.json +0 -0
  24. {ComplaintData → app/vectordatabase/ComplaintData}/publicSafety_langchain_formatted.json +0 -0
  25. {ComplaintData → app/vectordatabase/ComplaintData}/roadsInfrastructure.json +0 -0
  26. {ComplaintData → app/vectordatabase/ComplaintData}/roadsInfrastructure_langchain_formatted.json +0 -0
  27. {ComplaintData → app/vectordatabase/ComplaintData}/sanitationCleanliness.json +0 -0
  28. {ComplaintData → app/vectordatabase/ComplaintData}/sanitationCleanliness_langchain_formatted.json +0 -0
  29. {ComplaintData → app/vectordatabase/ComplaintData}/waterSewage.json +0 -0
  30. {ComplaintData → app/vectordatabase/ComplaintData}/waterSewage_langchain_formatted.json +0 -0
  31. app/vectordatabase/__init__.py +0 -0
  32. app/vectordatabase/matching_data_bm25.pkl +3 -0
  33. vectordatabase.py → app/vectordatabase/pinecone.py +31 -30
  34. app/vectordatabase/priority_bm25.pkl +3 -0
app/ai_agents/__init__.py ADDED
File without changes
agents.py → app/ai_agents/agents.py RENAMED
@@ -1,6 +1,6 @@
1
  from langchain_groq import ChatGroq
2
- from config import settings
3
- from schemas import ComplaintClassificationResponse
4
  import os
5
 
6
 
 
1
  from langchain_groq import ChatGroq
2
+ from app.core.config import settings
3
+ from app.schemas.schemas import ComplaintClassificationResponse
4
  import os
5
 
6
 
app/core/__init__.py ADDED
File without changes
config.py → app/core/config.py RENAMED
@@ -1,28 +1,20 @@
1
- import os
2
  from pathlib import Path
3
- from typing import Optional
4
  from pydantic_settings import BaseSettings, SettingsConfigDict
5
-
6
- # Since your .env is in the root (based on your folder structure image)
7
- BASE_DIR = Path(__file__).resolve().parent
8
-
9
  class Settings(BaseSettings):
10
- # Project Metadata
11
- PROJECT_NAME: str
12
-
13
- # API Keys & Secrets
14
  PINECONE_API_KEY: str
15
- PINECONE_ENVIRONMENT: str
16
-
17
- # Database Configuration
18
- DATABASE_URL: str
19
 
20
- # Pydantic Settings Config
21
  model_config = SettingsConfigDict(
22
  env_file=str(BASE_DIR / ".env"),
23
  env_file_encoding="utf-8",
24
  extra="ignore"
25
  )
26
-
27
- # Singleton instance to be imported across the project
28
  settings = Settings()
 
 
1
  from pathlib import Path
 
2
  from pydantic_settings import BaseSettings, SettingsConfigDict
3
+
4
+ BASE_DIR = Path(__file__).resolve().parent.parent.parent
5
+
 
6
  class Settings(BaseSettings):
7
+ PROJECT_NAME: str = "City AI Sync"
8
+
9
+ GROQ_API_KEY: str
 
10
  PINECONE_API_KEY: str
11
+ DATABASE_URL: str
 
 
 
12
 
13
+
14
  model_config = SettingsConfigDict(
15
  env_file=str(BASE_DIR / ".env"),
16
  env_file_encoding="utf-8",
17
  extra="ignore"
18
  )
19
+
 
20
  settings = Settings()
app/database/__init__.py ADDED
File without changes
database.py → app/database/connection.py RENAMED
@@ -2,13 +2,17 @@ from sqlalchemy.orm import sessionmaker, Session
2
  from sqlalchemy import create_engine, Column, Integer, String, Float, ForeignKey
3
  from sqlalchemy.orm import declarative_base, relationship, sessionmaker
4
  import os
5
- from dotenv import load_dotenv
 
 
 
 
 
6
 
7
  # This loads the variables from your .env file into the system environment
8
- load_dotenv()
9
 
10
  # Access them using os.getenv()
11
- DB_URL= os.getenv("DB_URL")
12
 
13
  Base = declarative_base()
14
 
 
2
  from sqlalchemy import create_engine, Column, Integer, String, Float, ForeignKey
3
  from sqlalchemy.orm import declarative_base, relationship, sessionmaker
4
  import os
5
+ from app.core.config import settings
6
+
7
+ DB_URL=settings.DATABASE_URL
8
+
9
+
10
+
11
 
12
  # This loads the variables from your .env file into the system environment
 
13
 
14
  # Access them using os.getenv()
15
+
16
 
17
  Base = declarative_base()
18
 
graph.py → app/graph.py RENAMED
@@ -1,6 +1,6 @@
1
- from state import ComplaintState
2
  from langgraph.graph import StateGraph, START, END
3
- from nodes import classify_complaint_node, group_duplicate_complaints_node, calculate_priority_node, store_data_node, router
4
  workflow = StateGraph(ComplaintState)
5
 
6
  # Add Nodes
 
1
+ from app.state.state import ComplaintState
2
  from langgraph.graph import StateGraph, START, END
3
+ from app.nodes.nodes import classify_complaint_node, group_duplicate_complaints_node, calculate_priority_node, store_data_node, router
4
  workflow = StateGraph(ComplaintState)
5
 
6
  # Add Nodes
graph_trial.py → app/graph_trial.py RENAMED
@@ -1,4 +1,4 @@
1
- from graph import graph
2
 
3
  from langgraph.checkpoint.memory import MemorySaver
4
  checkpointer = MemorySaver()
 
1
+ from app.graph import graph
2
 
3
  from langgraph.checkpoint.memory import MemorySaver
4
  checkpointer = MemorySaver()
app.py → app/main.py RENAMED
@@ -2,10 +2,9 @@ import os
2
  from fastapi import FastAPI, HTTPException, Depends
3
  from pydantic import BaseModel, Field
4
  from typing import Optional
5
-
6
  # Importing internal modules
7
- from config import settings
8
- from graph import graph # The compiled LangGraph instance
9
 
10
  app = FastAPI(title=settings.PROJECT_NAME)
11
 
 
2
  from fastapi import FastAPI, HTTPException, Depends
3
  from pydantic import BaseModel, Field
4
  from typing import Optional
 
5
  # Importing internal modules
6
+ from app.core.config import settings
7
+ from app.graph import graph # The compiled LangGraph instance
8
 
9
  app = FastAPI(title=settings.PROJECT_NAME)
10
 
nodes.py → app/nodes/nodes.py RENAMED
@@ -1,10 +1,10 @@
1
- from state import ComplaintState
2
- from database import Complaint, ComplaintUser, get_session
3
  from typing import Literal
4
  from sqlalchemy.orm import Session
5
- from agents import complaint_classifier_agent
6
- from vectordatabase import matching_retriever,retriever
7
- from utils import *
8
 
9
  # Assuming these are available in your global environment or config
10
  # from config import matching_retriever, get_session
 
1
+ from app.state.state import ComplaintState
2
+ from app.database.connection import Complaint, ComplaintUser, get_session
3
  from typing import Literal
4
  from sqlalchemy.orm import Session
5
+ from app.vectordatabase.pinecone import matching_retriever,retriever
6
+ from app.ai_agents.agents import complaint_classifier_agent
7
+ from app.utils.utils import *
8
 
9
  # Assuming these are available in your global environment or config
10
  # from config import matching_retriever, get_session
schemas.py → app/schemas/schemas.py RENAMED
File without changes
state.py → app/state/state.py RENAMED
File without changes
utils.py → app/utils/utils.py RENAMED
@@ -1,5 +1,5 @@
1
  from sqlalchemy import and_
2
- from database import Complaint, ComplaintUser
3
  import numpy as np
4
 
5
 
 
1
  from sqlalchemy import and_
2
+ from app.database.connection import Complaint, ComplaintUser
3
  import numpy as np
4
 
5
 
{ComplaintData → app/vectordatabase/ComplaintData}/buildingsPlanning.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/buildingsPlanning_langchain_formatted.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/complaint_matching_data.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/electricity.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/electricity_langchain_formatted.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/parkRecreation.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/parkRecreation_langchain_formatted.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/publicHealth.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/publicHealth_langchain_formatted.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/publicSafety.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/publicSafety_langchain_formatted.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/roadsInfrastructure.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/roadsInfrastructure_langchain_formatted.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/sanitationCleanliness.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/sanitationCleanliness_langchain_formatted.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/waterSewage.json RENAMED
File without changes
{ComplaintData → app/vectordatabase/ComplaintData}/waterSewage_langchain_formatted.json RENAMED
File without changes
app/vectordatabase/__init__.py ADDED
File without changes
app/vectordatabase/matching_data_bm25.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e110dd07ebf397e44dc33e801131b710ad67fab368e68373f0ad4e925eed7a70
3
+ size 3129
vectordatabase.py → app/vectordatabase/pinecone.py RENAMED
@@ -6,19 +6,29 @@ from typing import List
6
  from pathlib import Path
7
  from pinecone import Pinecone, ServerlessSpec
8
  from pinecone_text.sparse import BM25Encoder
9
- from langchain_huggingface import HuggingFaceEmbeddings
10
  from langchain_community.retrievers import PineconeHybridSearchRetriever
11
  from langchain_core.documents import Document
12
  from langchain_core.embeddings import Embeddings
 
13
 
14
- # 1. Environment & API Setup
15
- PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
 
 
 
 
 
 
 
 
 
 
16
  if not PINECONE_API_KEY:
17
- raise ValueError("PINECONE_API_KEY not found. Set it in HF Space Secrets.")
18
 
19
  pc = Pinecone(api_key=PINECONE_API_KEY)
20
 
21
- # 2. Remote Embedding Configuration
22
  class GeneralRemoteEmbeddings(Embeddings):
23
  def __init__(self, endpoint: str):
24
  self.endpoint = endpoint
@@ -35,7 +45,7 @@ class GeneralRemoteEmbeddings(Embeddings):
35
 
36
  embeddings = GeneralRemoteEmbeddings(endpoint="https://gaykar-generalembeddings.hf.space")
37
 
38
- # 3. Index Initialization Helper
39
  def get_or_create_index(name: str):
40
  if name not in pc.list_indexes().names():
41
  pc.create_index(
@@ -46,17 +56,13 @@ def get_or_create_index(name: str):
46
  )
47
  return pc.Index(name)
48
 
49
- # Initialize both indices
50
  index_general = get_or_create_index("complaints-index")
51
  index_matching = get_or_create_index("user-complaint-matching-index")
52
 
53
- # 4. Data Loading (Linux Compatible Paths)
54
- BASE_DATA_DIR = Path("ComplaintData")
55
- PRIORITY_BM25_PKL = Path("priority_bm25.pkl")
56
- MATCHING_BM25_PKL = Path("matching_data_bm25.pkl")
57
-
58
  def load_docs_from_json(pattern: str):
59
  docs = []
 
60
  for file_path in BASE_DATA_DIR.glob(pattern):
61
  with open(file_path, "r", encoding="utf-8") as f:
62
  try:
@@ -70,21 +76,19 @@ def load_docs_from_json(pattern: str):
70
  print(f"Error loading {file_path}: {e}")
71
  return docs
72
 
73
- # --- 5. BM25 & Retriever Setup for Priority Scoring ---
74
  general_docs = load_docs_from_json("*_langchain_formatted.json")
75
  bm25_general = BM25Encoder()
76
 
77
  if PRIORITY_BM25_PKL.exists():
78
- print("Loading existing Priority BM25 model from pickle...")
79
  with open(PRIORITY_BM25_PKL, "rb") as f:
80
  bm25_general = pickle.load(f)
81
  else:
82
- if general_docs:
83
- print("Fitting Priority BM25 on general knowledge base...")
84
- bm25_general.fit([doc.page_content for doc in general_docs])
85
- with open(PRIORITY_BM25_PKL, "wb") as f:
86
- pickle.dump(bm25_general, f)
87
- print(f"Priority BM25 fitted and saved to {PRIORITY_BM25_PKL}")
88
 
89
  retriever = PineconeHybridSearchRetriever(
90
  embeddings=embeddings,
@@ -93,21 +97,19 @@ retriever = PineconeHybridSearchRetriever(
93
  alpha=0.85
94
  )
95
 
96
- # --- 6. BM25 & Retriever Setup for Duplicate Matching ---
97
  matching_docs = load_docs_from_json("complaint_matching_data.json")
98
  bm25_matching = BM25Encoder()
99
 
100
  if MATCHING_BM25_PKL.exists():
101
- print("Loading existing Matching BM25 model from pickle...")
102
  with open(MATCHING_BM25_PKL, "rb") as f:
103
  bm25_matching = pickle.load(f)
104
  else:
105
- if matching_docs:
106
- print("Fitting Matching BM25 on complaint matching data...")
107
- bm25_matching.fit([doc.page_content for doc in matching_docs])
108
- with open(MATCHING_BM25_PKL, "wb") as f:
109
- pickle.dump(bm25_matching, f)
110
- print(f"Matching BM25 fitted and saved to {MATCHING_BM25_PKL}")
111
 
112
  matching_retriever = PineconeHybridSearchRetriever(
113
  embeddings=embeddings,
@@ -115,5 +117,4 @@ matching_retriever = PineconeHybridSearchRetriever(
115
  index=index_matching,
116
  top_k=1,
117
  alpha=0.9
118
- )
119
-
 
6
  from pathlib import Path
7
  from pinecone import Pinecone, ServerlessSpec
8
  from pinecone_text.sparse import BM25Encoder
 
9
  from langchain_community.retrievers import PineconeHybridSearchRetriever
10
  from langchain_core.documents import Document
11
  from langchain_core.embeddings import Embeddings
12
+ from app.core.config import settings
13
 
14
+ # 1. Path Resolution (Fixes the folder structure issue)
15
+ # This finds the absolute path to the directory containing this file
16
+ current_file_path = Path(__file__).resolve()
17
+ VDB_DIR = current_file_path.parent
18
+ BASE_DATA_DIR = VDB_DIR / "ComplaintData"
19
+
20
+ # Pickle files will now be stored inside the vectordatabase folder too
21
+ PRIORITY_BM25_PKL = VDB_DIR / "priority_bm25.pkl"
22
+ MATCHING_BM25_PKL = VDB_DIR / "matching_data_bm25.pkl"
23
+
24
+ # 2. Environment & API Setup
25
+ PINECONE_API_KEY = settings.PINECONE_API_KEY
26
  if not PINECONE_API_KEY:
27
+ raise ValueError("PINECONE_API_KEY not found in settings.")
28
 
29
  pc = Pinecone(api_key=PINECONE_API_KEY)
30
 
31
+ # 3. Remote Embedding Configuration
32
  class GeneralRemoteEmbeddings(Embeddings):
33
  def __init__(self, endpoint: str):
34
  self.endpoint = endpoint
 
45
 
46
  embeddings = GeneralRemoteEmbeddings(endpoint="https://gaykar-generalembeddings.hf.space")
47
 
48
+ # 4. Index Initialization Helper
49
  def get_or_create_index(name: str):
50
  if name not in pc.list_indexes().names():
51
  pc.create_index(
 
56
  )
57
  return pc.Index(name)
58
 
 
59
  index_general = get_or_create_index("complaints-index")
60
  index_matching = get_or_create_index("user-complaint-matching-index")
61
 
62
+ # 5. Data Loading Logic
 
 
 
 
63
  def load_docs_from_json(pattern: str):
64
  docs = []
65
+ # Search inside the absolute path resolved in step 1
66
  for file_path in BASE_DATA_DIR.glob(pattern):
67
  with open(file_path, "r", encoding="utf-8") as f:
68
  try:
 
76
  print(f"Error loading {file_path}: {e}")
77
  return docs
78
 
79
+ # --- 6. BM25 & Retriever Setup for Priority Scoring ---
80
  general_docs = load_docs_from_json("*_langchain_formatted.json")
81
  bm25_general = BM25Encoder()
82
 
83
  if PRIORITY_BM25_PKL.exists():
 
84
  with open(PRIORITY_BM25_PKL, "rb") as f:
85
  bm25_general = pickle.load(f)
86
  else:
87
+ # IMPORTANT: Always fit on at least one string to prevent "not fit" error
88
+ texts = [doc.page_content for doc in general_docs] if general_docs else ["seed text for priority"]
89
+ bm25_general.fit(texts)
90
+ with open(PRIORITY_BM25_PKL, "wb") as f:
91
+ pickle.dump(bm25_general, f)
 
92
 
93
  retriever = PineconeHybridSearchRetriever(
94
  embeddings=embeddings,
 
97
  alpha=0.85
98
  )
99
 
100
+ # --- 7. BM25 & Retriever Setup for Duplicate Matching ---
101
  matching_docs = load_docs_from_json("complaint_matching_data.json")
102
  bm25_matching = BM25Encoder()
103
 
104
  if MATCHING_BM25_PKL.exists():
 
105
  with open(MATCHING_BM25_PKL, "rb") as f:
106
  bm25_matching = pickle.load(f)
107
  else:
108
+ # Safety fit for matching retriever
109
+ texts = [doc.page_content for doc in matching_docs] if matching_docs else ["seed text for matching"]
110
+ bm25_matching.fit(texts)
111
+ with open(MATCHING_BM25_PKL, "wb") as f:
112
+ pickle.dump(bm25_matching, f)
 
113
 
114
  matching_retriever = PineconeHybridSearchRetriever(
115
  embeddings=embeddings,
 
117
  index=index_matching,
118
  top_k=1,
119
  alpha=0.9
120
+ )
 
app/vectordatabase/priority_bm25.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42720b8474af5f170e289a6f49422ed74901b5546d1dd7fa6b2121277040eea7
3
+ size 14608