Bhanushray committed on
Commit
90af697
·
verified ·
1 Parent(s): 6d2565e

Update backend/rag_engine.py

Browse files
Files changed (1) hide show
  1. backend/rag_engine.py +140 -140
backend/rag_engine.py CHANGED
@@ -1,141 +1,141 @@
1
-
2
- import json
3
- import os
4
- from langchain_groq import ChatGroq
5
- from langchain_huggingface import HuggingFaceEmbeddings
6
- from langchain_chroma import Chroma
7
- from langchain_core.prompts import ChatPromptTemplate
8
- from dotenv import load_dotenv
9
-
10
- # LOAD ENV
11
- load_dotenv()
12
-
13
- DbPath = "./chroma_db"
14
-
15
def GetRagResponse(UserQuery):
    """Run one retrieval-augmented chat turn for ``UserQuery``.

    Documents are fetched from the Chroma store persisted at ``DbPath``,
    their text is handed to the Groq chat model as context, and the reply is
    returned together with the files/pages the context was drawn from.

    Args:
        UserQuery: Question text, used both for retrieval and in the prompt.

    Returns:
        dict: ``{"answer": <reply text>, "sources": [...]}`` where each source
        is ``{"source": <filename>, "page": <page as str>}`` and every
        filename/page pair appears once. If anything raises, the answer is
        ``"System Error: <message>"`` and ``sources`` is an empty list.
    """
    try:
        # Vector store backed by the on-disk Chroma directory.
        store = Chroma(
            persist_directory=DbPath,
            embedding_function=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
        )

        # Fetch up to five documents for the query and merge their text.
        hits = store.as_retriever(search_kwargs={"k": 5}).invoke(UserQuery)
        context = "\n\n".join(hit.page_content for hit in hits)

        # Chat model; the key comes from the GROQ_API_KEY environment variable.
        model = ChatGroq(
            model="llama-3.3-70b-versatile",
            temperature=0,
            api_key=os.getenv("GROQ_API_KEY"),
        )

        # Prompt piped into the model, then invoked with context + question.
        messages = [
            ("system", "Answer based ONLY on context. If unsure, say unknown."),
            ("human", "Context:\n{context}\n\nQuestion:\n{question}"),
        ]
        reply = (ChatPromptTemplate.from_messages(messages) | model).invoke(
            {"context": context, "question": UserQuery}
        )

        # One entry per distinct filename/page pair, kept in retrieval order.
        unique_sources = {}
        for hit in hits:
            page = hit.metadata.get("page", "?")  # '?' when no page recorded
            filename = hit.metadata.get("filename", "Unknown")
            unique_sources.setdefault(
                f"{filename}-{page}",
                {"source": filename, "page": str(page)},
            )

        return {"answer": reply.content, "sources": list(unique_sources.values())}

    except Exception as err:
        print(f"CHAT ERROR: {err}")
        return {"answer": f"System Error: {str(err)}", "sources": []}
63
-
64
def ExtractStructure(Requirement):
    """Extract a door-schedule style table as a list of JSON records.

    Retrieves up to 15 documents for ``Requirement`` from the Chroma store
    persisted at ``DbPath``, asks the Groq chat model to return the matching
    table as a JSON array, and parses that array out of the reply.

    Args:
        Requirement: Name/description of the item to extract; used as the
            retrieval query and interpolated into the prompt.

    Returns:
        dict: ``{"data": <parsed JSON, or [] if parsing failed>, "sources":
        [{"source": <filename>, "page": <page as str>}, ...]}`` with one
        source entry per distinct filename/page pair. If retrieval or the
        model call raises, returns ``{"data": [], "sources": []}``.
    """
    try:
        # 1. SETUP
        EmbedModel = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        VectorDb = Chroma(persist_directory=DbPath, embedding_function=EmbedModel)
        # INCREASE K to find the table spread across pages
        Retriever = VectorDb.as_retriever(search_kwargs={"k": 15})
        Docs = Retriever.invoke(Requirement)
        ContextText = "\n\n".join([d.page_content for d in Docs])

        # 2. LLM
        Llm = ChatGroq(
            model="llama-3.3-70b-versatile",
            temperature=0,
            api_key=os.getenv("GROQ_API_KEY")
        )

        # 3. PROMPT
        Prompt = f"""
        Extract the "{Requirement}" from the text.

        Look for a table with columns like: Door #, Wall Type, Frame Type, Door Type, Height, Width, Notes.

        Return ONLY valid JSON.
        Start the response with [ and end with ].
        Do NOT write "Here is the JSON".

        Use this Schema:
        [
            {{
                "mark": "Door Number (e.g. 1, 2, D-101)",
                "frame_type": "Material (e.g. Hollow Metal, Aluminum)",
                "door_type": "Type (e.g. Single, Double Egress)",
                "size": "Height/Width info",
                "notes": "Any notes (e.g. AE601 TYP)"
            }}
        ]

        TEXT:
        {ContextText}
        """

        Response = Llm.invoke(Prompt)
        RawContent = Response.content

        # 4. ROBUST JSON PARSING
        # The reply may carry extra text around the array, so parse the span
        # from the first '[' to the last ']' when both are present.
        try:
            Start = RawContent.find('[')
            End = RawContent.rfind(']') + 1
            if Start != -1 and End != 0:
                JsonStr = RawContent[Start:End]
                Data = json.loads(JsonStr)
            else:
                Data = json.loads(RawContent)
        except (ValueError, TypeError, AttributeError):
            # ValueError covers json.JSONDecodeError; TypeError/AttributeError
            # cover a reply whose content is not a plain string. A bare
            # ``except:`` would also swallow KeyboardInterrupt/SystemExit.
            print(f"JSON PARSE FAIL: {RawContent}")
            # If JSON fails, return empty list so UI doesn't crash
            Data = []

        # 5. FORMAT SOURCES
        FormattedSources = []
        Seen = set()
        for d in Docs:
            Page = d.metadata.get("page", "?")
            Filename = d.metadata.get("filename", "Unknown")
            Key = f"{Filename}-{Page}"
            if Key not in Seen:
                FormattedSources.append({
                    "source": Filename,
                    "page": str(Page)
                })
                Seen.add(Key)

        return {"data": Data, "sources": FormattedSources}

    except Exception as e:
        print(f"EXTRACTION ERROR: {e}")
        return {"data": [], "sources": []}
 
1
+
2
+ import json
3
+ import os
4
+ from langchain_groq import ChatGroq
5
+ from langchain_huggingface import HuggingFaceEmbeddings
6
+ from langchain_chroma import Chroma
7
+ from langchain_core.prompts import ChatPromptTemplate
8
+ from dotenv import load_dotenv
9
+
10
+ # LOAD ENV
11
+ load_dotenv()
12
+
13
+ DbPath = "./chroma_db"
14
+
15
def GetRagResponse(UserQuery):
    """Answer a user question from the documents stored in the Chroma database.

    Retrieves up to 5 documents for ``UserQuery`` from the vector store
    persisted at ``DbPath``, passes their text as context to the Groq chat
    model, and reports which files/pages the context came from.

    Args:
        UserQuery: The question text; used both as the retrieval query and as
            the question in the prompt.

    Returns:
        dict: ``{"answer": <model reply text>, "sources": [{"source":
        <filename>, "page": <page as str>}, ...]}`` with one entry per
        distinct filename/page pair. On any exception the answer is
        ``"System Error: <message>"`` and ``sources`` is empty.
    """
    try:
        # 1. SETUP DATABASE
        EmbedModel = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        VectorDb = Chroma(persist_directory=DbPath, embedding_function=EmbedModel)

        # 2. RETRIEVE
        Retriever = VectorDb.as_retriever(search_kwargs={"k": 5})
        Docs = Retriever.invoke(UserQuery)
        ContextText = "\n\n".join([d.page_content for d in Docs])

        # 3. LLM
        # SECURITY: the key is read from the GROQ_API_KEY environment variable
        # instead of being embedded in source. Any key that was previously
        # committed here must be treated as leaked and revoked.
        Llm = ChatGroq(
            model="llama-3.3-70b-versatile",
            temperature=0,
            api_key=os.getenv("GROQ_API_KEY")
        )

        # 4. CHAIN
        SystemPrompt = "Answer based ONLY on context. If unsure, say unknown."
        PromptTemplate = ChatPromptTemplate.from_messages([
            ("system", SystemPrompt),
            ("human", "Context:\n{context}\n\nQuestion:\n{question}")
        ])
        Chain = PromptTemplate | Llm
        Response = Chain.invoke({"context": ContextText, "question": UserQuery})

        # 5. FORMAT SOURCES
        FormattedSources = []
        Seen = set()
        for d in Docs:
            # Safely get page number, default to '?' if missing
            Page = d.metadata.get("page", "?")
            Filename = d.metadata.get("filename", "Unknown")
            Key = f"{Filename}-{Page}"

            # Report each filename/page pair only once.
            if Key not in Seen:
                FormattedSources.append({
                    "source": Filename,
                    "page": str(Page)
                })
                Seen.add(Key)

        return {"answer": Response.content, "sources": FormattedSources}

    except Exception as e:
        print(f"CHAT ERROR: {e}")
        return {"answer": f"System Error: {str(e)}", "sources": []}
63
+
64
def ExtractStructure(Requirement):
    """Extract a door-schedule style table as a list of JSON records.

    Retrieves up to 15 documents for ``Requirement`` from the Chroma store
    persisted at ``DbPath``, asks the Groq chat model to return the matching
    table as a JSON array, and parses that array out of the reply.

    Args:
        Requirement: Name/description of the item to extract; used as the
            retrieval query and interpolated into the prompt.

    Returns:
        dict: ``{"data": <parsed JSON, or [] if parsing failed>, "sources":
        [{"source": <filename>, "page": <page as str>}, ...]}`` with one
        source entry per distinct filename/page pair. If retrieval or the
        model call raises, returns ``{"data": [], "sources": []}``.
    """
    try:
        # 1. SETUP
        EmbedModel = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        VectorDb = Chroma(persist_directory=DbPath, embedding_function=EmbedModel)
        # INCREASE K to find the table spread across pages
        Retriever = VectorDb.as_retriever(search_kwargs={"k": 15})
        Docs = Retriever.invoke(Requirement)
        ContextText = "\n\n".join([d.page_content for d in Docs])

        # 2. LLM
        Llm = ChatGroq(
            model="llama-3.3-70b-versatile",
            temperature=0,
            api_key=os.getenv("GROQ_API_KEY")
        )

        # 3. PROMPT
        Prompt = f"""
        Extract the "{Requirement}" from the text.

        Look for a table with columns like: Door #, Wall Type, Frame Type, Door Type, Height, Width, Notes.

        Return ONLY valid JSON.
        Start the response with [ and end with ].
        Do NOT write "Here is the JSON".

        Use this Schema:
        [
            {{
                "mark": "Door Number (e.g. 1, 2, D-101)",
                "frame_type": "Material (e.g. Hollow Metal, Aluminum)",
                "door_type": "Type (e.g. Single, Double Egress)",
                "size": "Height/Width info",
                "notes": "Any notes (e.g. AE601 TYP)"
            }}
        ]

        TEXT:
        {ContextText}
        """

        Response = Llm.invoke(Prompt)
        RawContent = Response.content

        # 4. ROBUST JSON PARSING
        # The reply may carry extra text around the array, so parse the span
        # from the first '[' to the last ']' when both are present.
        try:
            Start = RawContent.find('[')
            End = RawContent.rfind(']') + 1
            if Start != -1 and End != 0:
                JsonStr = RawContent[Start:End]
                Data = json.loads(JsonStr)
            else:
                Data = json.loads(RawContent)
        except (ValueError, TypeError, AttributeError):
            # ValueError covers json.JSONDecodeError; TypeError/AttributeError
            # cover a reply whose content is not a plain string. A bare
            # ``except:`` would also swallow KeyboardInterrupt/SystemExit.
            print(f"JSON PARSE FAIL: {RawContent}")
            # If JSON fails, return empty list so UI doesn't crash
            Data = []

        # 5. FORMAT SOURCES
        FormattedSources = []
        Seen = set()
        for d in Docs:
            Page = d.metadata.get("page", "?")
            Filename = d.metadata.get("filename", "Unknown")
            Key = f"{Filename}-{Page}"
            if Key not in Seen:
                FormattedSources.append({
                    "source": Filename,
                    "page": str(Page)
                })
                Seen.add(Key)

        return {"data": Data, "sources": FormattedSources}

    except Exception as e:
        print(f"EXTRACTION ERROR: {e}")
        return {"data": [], "sources": []}