Spaces:
Sleeping
Sleeping
Commit ·
d51a05c
1
Parent(s): 0a0f155
oneshot prompt change
Browse files- .gitignore +1 -0
- LLM/one_shotter.py +5 -4
- RAG/rag_modules/query_expansion.py +12 -5
- config/config.py +7 -7
- preprocessing/preprocessing_modules/modular_preprocessor.py +2 -2
.gitignore
CHANGED
|
@@ -7,3 +7,4 @@ test*
|
|
| 7 |
all-MiniLM-L6-v2
|
| 8 |
cross-encoder/ms-marco-MiniLM-L-6-v2
|
| 9 |
test
|
|
|
|
|
|
| 7 |
all-MiniLM-L6-v2
|
| 8 |
cross-encoder/ms-marco-MiniLM-L-6-v2
|
| 9 |
test
|
| 10 |
+
RAG/rag_embeddings/[a-z]*
|
LLM/one_shotter.py
CHANGED
|
@@ -414,7 +414,8 @@ CRITICAL INSTRUCTIONS:
|
|
| 414 |
4. BE THOROUGH: Don't just use the original context - actively look for and incorporate information from scraped websites
|
| 415 |
5. DETAILED EXPLANATIONS: Provide comprehensive, well-structured answers with specific details
|
| 416 |
6. IF MISSING INFO: Only state information is missing if it's truly not available in ANY part of the provided context
|
| 417 |
-
|
|
|
|
| 418 |
The context may contain multiple sections:
|
| 419 |
- Original context
|
| 420 |
- Additional Information from relevant links
|
|
@@ -425,12 +426,12 @@ USE ALL OF THESE SECTIONS TO PROVIDE COMPLETE ANSWERS.
|
|
| 425 |
Respond in this EXACT JSON format:
|
| 426 |
{{
|
| 427 |
"answers": [
|
| 428 |
-
"<Correct Answer to the question 1,
|
| 429 |
-
"<Correct Answer to the question 2,
|
| 430 |
...
|
| 431 |
]
|
| 432 |
}}
|
| 433 |
-
|
| 434 |
])
|
| 435 |
|
| 436 |
questions_text = "\n".join([f"{i+1}. {q.strip()}" for i, q in enumerate(questions)])
|
|
|
|
| 414 |
4. BE THOROUGH: Don't just use the original context - actively look for and incorporate information from scraped websites
|
| 415 |
5. DETAILED EXPLANATIONS: Provide comprehensive, well-structured answers with specific details
|
| 416 |
6. IF MISSING INFO: Only state information is missing if it's truly not available in ANY part of the provided context
|
| 417 |
+
7. First give the correct answer and then explain in short, you don't need to outline your thought process.
|
| 418 |
+
8. Never make any assumptions on your own.
|
| 419 |
The context may contain multiple sections:
|
| 420 |
- Original context
|
| 421 |
- Additional Information from relevant links
|
|
|
|
| 426 |
Respond in this EXACT JSON format:
|
| 427 |
{{
|
| 428 |
"answers": [
|
| 429 |
+
"<Correct Answer to the question 1, followed by explanation.>",
|
| 430 |
+
"<Correct Answer to the question 2, followed by explanation only if question 2 exists.>",
|
| 431 |
...
|
| 432 |
]
|
| 433 |
}}
|
| 434 |
+
""")
|
| 435 |
])
|
| 436 |
|
| 437 |
questions_text = "\n".join([f"{i+1}. {q.strip()}" for i, q in enumerate(questions)])
|
RAG/rag_modules/query_expansion.py
CHANGED
|
@@ -42,9 +42,11 @@ Examples of good breakdown:
|
|
| 42 |
|
| 43 |
|
| 44 |
Provide only {QUERY_EXPANSION_COUNT} focused sub-questions, one per line, without numbering or additional formatting:
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
| 48 |
|
| 49 |
"""
|
| 50 |
|
|
@@ -60,14 +62,19 @@ Here are the focused sub queries:
|
|
| 60 |
if response:
|
| 61 |
sub_queries = response.strip().split('\n')
|
| 62 |
for query in sub_queries:
|
| 63 |
-
if len(expanded_queries) >= QUERY_EXPANSION_COUNT: # Stop when we have enough
|
| 64 |
break
|
| 65 |
query = query.strip()
|
| 66 |
# Remove any numbering or bullet points that might be added
|
| 67 |
query = re.sub(r'^[\d\.\-\*\s]+', '', query).strip()
|
| 68 |
if query and len(query) > 10:
|
| 69 |
expanded_queries.append(query)
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
# If we don't have enough sub-queries, fall back to using the original
|
| 72 |
if len(expanded_queries) < QUERY_EXPANSION_COUNT:
|
| 73 |
expanded_queries = [original_query]
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
Provide only {QUERY_EXPANSION_COUNT} focused sub-questions, one per line, without numbering or additional formatting:
|
| 45 |
+
Example Response:
|
| 46 |
+
Here are the focused sub queries
|
| 47 |
+
subquery1
|
| 48 |
+
subquery2 (if exists)
|
| 49 |
+
...
|
| 50 |
|
| 51 |
"""
|
| 52 |
|
|
|
|
| 62 |
if response:
|
| 63 |
sub_queries = response.strip().split('\n')
|
| 64 |
for query in sub_queries:
|
| 65 |
+
if len(expanded_queries) >= QUERY_EXPANSION_COUNT + 1: # Stop when we have enough
|
| 66 |
break
|
| 67 |
query = query.strip()
|
| 68 |
# Remove any numbering or bullet points that might be added
|
| 69 |
query = re.sub(r'^[\d\.\-\*\s]+', '', query).strip()
|
| 70 |
if query and len(query) > 10:
|
| 71 |
expanded_queries.append(query)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
if len(expanded_queries) > 1:
|
| 75 |
+
expanded_queries.pop(0)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
# If we don't have enough sub-queries, fall back to using the original
|
| 79 |
if len(expanded_queries) < QUERY_EXPANSION_COUNT:
|
| 80 |
expanded_queries = [original_query]
|
config/config.py
CHANGED
|
@@ -32,13 +32,13 @@ GROQ_MODEL_LITE = "llama3-8b-8192"
|
|
| 32 |
BEARER_TOKEN = os.getenv("BEARER_TOKEN")
|
| 33 |
|
| 34 |
# Chunking
|
| 35 |
-
CHUNK_SIZE = 400
|
| 36 |
-
CHUNK_OVERLAP = 100
|
| 37 |
|
| 38 |
# Retrieval Settings
|
| 39 |
-
TOP_K =
|
| 40 |
SCORE_THRESHOLD = 0.3
|
| 41 |
-
RERANK_TOP_K =
|
| 42 |
BM25_WEIGHT = 0.3
|
| 43 |
SEMANTIC_WEIGHT = 0.7
|
| 44 |
|
|
@@ -49,7 +49,7 @@ ENABLE_QUERY_EXPANSION = True
|
|
| 49 |
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
| 50 |
QUERY_EXPANSION_COUNT = 3
|
| 51 |
SCORE_THRESHOLD = 0.3
|
| 52 |
-
MAX_CONTEXT_LENGTH = 4000
|
| 53 |
|
| 54 |
USE_TOTAL_BUDGET_APPROACH = True
|
| 55 |
|
|
@@ -110,8 +110,8 @@ def get_provider_configs():
|
|
| 110 |
configs["openai"] = [{
|
| 111 |
"name": sequence[i],
|
| 112 |
"api_key": os.getenv(f"OPENAI_API_KEY_{i}"),
|
| 113 |
-
"model": os.getenv(f"OPENAI_MODEL_{i}",
|
| 114 |
-
} for i in range(10) if os.getenv(f"
|
| 115 |
]
|
| 116 |
|
| 117 |
return configs
|
|
|
|
| 32 |
BEARER_TOKEN = os.getenv("BEARER_TOKEN")
|
| 33 |
|
| 34 |
# Chunking
|
| 35 |
+
CHUNK_SIZE = 400 * 4
|
| 36 |
+
CHUNK_OVERLAP = 100 * 4
|
| 37 |
|
| 38 |
# Retrieval Settings
|
| 39 |
+
TOP_K = 9
|
| 40 |
SCORE_THRESHOLD = 0.3
|
| 41 |
+
RERANK_TOP_K = 7 # 9*400 = 3600, < 4000, some tokens reserved for questions
|
| 42 |
BM25_WEIGHT = 0.3
|
| 43 |
SEMANTIC_WEIGHT = 0.7
|
| 44 |
|
|
|
|
| 49 |
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
| 50 |
QUERY_EXPANSION_COUNT = 3
|
| 51 |
SCORE_THRESHOLD = 0.3
|
| 52 |
+
MAX_CONTEXT_LENGTH = 4000*4
|
| 53 |
|
| 54 |
USE_TOTAL_BUDGET_APPROACH = True
|
| 55 |
|
|
|
|
| 110 |
configs["openai"] = [{
|
| 111 |
"name": sequence[i],
|
| 112 |
"api_key": os.getenv(f"OPENAI_API_KEY_{i}"),
|
| 113 |
+
"model": os.getenv(f"OPENAI_MODEL_{i}", DEFAULT_OPENAI_MODEL)
|
| 114 |
+
} for i in range(10) if os.getenv(f"OPENAI_MODEL_{i}", "")
|
| 115 |
]
|
| 116 |
|
| 117 |
return configs
|
preprocessing/preprocessing_modules/modular_preprocessor.py
CHANGED
|
@@ -87,7 +87,7 @@ class ModularDocumentPreprocessor:
|
|
| 87 |
Process a single document: download, extract, chunk, embed, and store.
|
| 88 |
|
| 89 |
Args:
|
| 90 |
-
document_url: URL of the
|
| 91 |
force_reprocess: If True, reprocess even if already processed
|
| 92 |
timeout: Download timeout in seconds (default: 300s/5min)
|
| 93 |
|
|
@@ -106,7 +106,7 @@ class ModularDocumentPreprocessor:
|
|
| 106 |
|
| 107 |
temp_file_path = None
|
| 108 |
try:
|
| 109 |
-
# Step 1: Download
|
| 110 |
temp_file_path, ext = await self.file_downloader.download_file(document_url, timeout=timeout)
|
| 111 |
|
| 112 |
if temp_file_path == 'not supported':
|
|
|
|
| 87 |
Process a single document: download, extract, chunk, embed, and store.
|
| 88 |
|
| 89 |
Args:
|
| 90 |
+
document_url: URL of the document
|
| 91 |
force_reprocess: If True, reprocess even if already processed
|
| 92 |
timeout: Download timeout in seconds (default: 300s/5min)
|
| 93 |
|
|
|
|
| 106 |
|
| 107 |
temp_file_path = None
|
| 108 |
try:
|
| 109 |
+
# Step 1: Download Document
|
| 110 |
temp_file_path, ext = await self.file_downloader.download_file(document_url, timeout=timeout)
|
| 111 |
|
| 112 |
if temp_file_path == 'not supported':
|