Spaces:
Sleeping
Sleeping
Commit
·
310e8f9
1
Parent(s):
e3fee32
Update services/pipeline_generator.py
Browse files- services/pipeline_generator.py +70 -77
services/pipeline_generator.py
CHANGED
|
@@ -60,83 +60,76 @@ def generate_pipeline_bedrock(user_input: str, file_path: Optional[str] = None)
|
|
| 60 |
try:
|
| 61 |
llm = ChatBedrock(
|
| 62 |
model_id="mistral.mistral-large-2402-v1:0",
|
| 63 |
-
region_name=os.getenv("AWS_REGION", "
|
| 64 |
)
|
| 65 |
|
| 66 |
prompt = ChatPromptTemplate.from_messages([
|
| 67 |
-
("system", """You are a document processing pipeline
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
- start_page
|
| 77 |
-
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
-
|
| 82 |
-
|
|
|
|
|
|
|
| 83 |
- params: {{"detail_level": "low"|"medium"|"high"}}
|
| 84 |
|
| 85 |
-
4.
|
| 86 |
-
-
|
| 87 |
-
- params: {{"max_length":
|
| 88 |
|
| 89 |
-
5.
|
| 90 |
-
-
|
| 91 |
-
- params: {{"categories": list
|
| 92 |
|
| 93 |
-
6.
|
| 94 |
-
-
|
| 95 |
-
- params: {{"entity_types":
|
| 96 |
|
| 97 |
-
7.
|
| 98 |
-
-
|
| 99 |
-
- params: {{"target_lang":
|
| 100 |
|
| 101 |
-
8. signature_verification
|
| 102 |
-
- start_page
|
| 103 |
-
- end_page (int): Ending page number
|
| 104 |
- params: {{}}
|
| 105 |
|
| 106 |
-
9. stamp_detection
|
| 107 |
-
- start_page
|
| 108 |
-
- end_page (int): Ending page number
|
| 109 |
- params: {{}}
|
| 110 |
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
{{
|
| 113 |
"pipeline_name": "descriptive-name",
|
| 114 |
"components": [
|
| 115 |
-
{{
|
| 116 |
-
|
| 117 |
-
"start_page": 1,
|
| 118 |
-
"end_page": 5,
|
| 119 |
-
"params": {{"encoding": "utf-8"}}
|
| 120 |
-
}},
|
| 121 |
-
{{
|
| 122 |
-
"tool_name": "summarize_text",
|
| 123 |
-
"start_page": 1,
|
| 124 |
-
"end_page": 1,
|
| 125 |
-
"params": {{"max_length": 500}}
|
| 126 |
-
}}
|
| 127 |
],
|
| 128 |
"target_lang": null,
|
| 129 |
-
"reason": "Brief explanation
|
| 130 |
-
"metadata": {{
|
| 131 |
-
"estimated_duration_seconds": 30
|
| 132 |
-
}}
|
| 133 |
}}
|
| 134 |
|
| 135 |
-
|
| 136 |
-
- For text processing tools (summarize, classify, NER, translate): start_page=1, end_page=1
|
| 137 |
-
- For document extraction tools: use actual page ranges from user request
|
| 138 |
-
- Components execute in ORDER - ensure dependencies are met
|
| 139 |
-
- Always include "reason" explaining the pipeline choice"""),
|
| 140 |
("human", "User request: {input}\n\nFile: {file_path}")
|
| 141 |
])
|
| 142 |
|
|
@@ -193,37 +186,37 @@ def generate_pipeline_gemini(user_input: str, file_path: Optional[str] = None) -
|
|
| 193 |
if not GEMINI_API_KEY:
|
| 194 |
raise RuntimeError("Gemini API key not configured")
|
| 195 |
|
| 196 |
-
prompt = f"""You are
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
- extract_text: start_page, end_page
|
| 200 |
-
- extract_tables: start_page, end_page
|
| 201 |
-
- describe_images: start_page, end_page
|
| 202 |
-
-
|
| 203 |
-
-
|
| 204 |
-
-
|
| 205 |
-
-
|
| 206 |
-
- signature_verification: start_page, end_page
|
| 207 |
-
- stamp_detection: start_page, end_page
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
User request: {user_input}
|
| 210 |
File: {file_path or "user uploaded document"}
|
| 211 |
|
| 212 |
-
Return ONLY valid JSON
|
| 213 |
{{
|
| 214 |
"pipeline_name": "descriptive-name",
|
| 215 |
"components": [
|
| 216 |
-
{{
|
| 217 |
-
"tool_name": "extract_text",
|
| 218 |
-
"start_page": 1,
|
| 219 |
-
"end_page": 5,
|
| 220 |
-
"params": {{}}
|
| 221 |
-
}}
|
| 222 |
],
|
| 223 |
"target_lang": null,
|
| 224 |
"reason": "explanation",
|
| 225 |
"metadata": {{"estimated_duration_seconds": 30}}
|
| 226 |
-
}}
|
|
|
|
|
|
|
| 227 |
|
| 228 |
try:
|
| 229 |
response = requests.post(
|
|
|
|
| 60 |
try:
|
| 61 |
llm = ChatBedrock(
|
| 62 |
model_id="mistral.mistral-large-2402-v1:0",
|
| 63 |
+
region_name=os.getenv("AWS_REGION", "ap-south-1") # Default to Mumbai region (nearest)
|
| 64 |
)
|
| 65 |
|
| 66 |
prompt = ChatPromptTemplate.from_messages([
|
| 67 |
+
("system", """You are MasterLLM, a document processing pipeline orchestrator.
|
| 68 |
+
|
| 69 |
+
**YOUR ROLE:**
|
| 70 |
+
You are a helpful AI assistant that can have normal conversations AND create document processing pipelines when asked.
|
| 71 |
+
You should ONLY create pipelines when the user explicitly requests document processing operations.
|
| 72 |
+
For general questions, greetings, or information requests - just have a normal conversation.
|
| 73 |
+
|
| 74 |
+
**STRICT TOOL LIST - USE ONLY THESE TOOLS:**
|
| 75 |
+
1. extract_text (Extract text from PDFs/images)
|
| 76 |
+
- start_page, end_page
|
| 77 |
+
- params: {{"encoding": "utf-8", "preserve_layout": true/false}}
|
| 78 |
+
|
| 79 |
+
2. extract_tables (Extract tables from documents)
|
| 80 |
+
- start_page, end_page
|
| 81 |
+
- params: {{"format": "json" or "csv", "include_headers": true/false}}
|
| 82 |
+
|
| 83 |
+
3. describe_images (Generate descriptions of images)
|
| 84 |
+
- start_page, end_page
|
| 85 |
- params: {{"detail_level": "low"|"medium"|"high"}}
|
| 86 |
|
| 87 |
+
4. summarize (Summarize extracted text)
|
| 88 |
+
- start_page: 1, end_page: 1 (always)
|
| 89 |
+
- params: {{"max_length": 500, "style": "concise" or "detailed"}}
|
| 90 |
|
| 91 |
+
5. classify (Classify document content)
|
| 92 |
+
- start_page: 1, end_page: 1 (always)
|
| 93 |
+
- params: {{"categories": ["list", "of", "categories"]}}
|
| 94 |
|
| 95 |
+
6. ner (Named Entity Recognition - people, places, orgs)
|
| 96 |
+
- start_page: 1, end_page: 1 (always)
|
| 97 |
+
- params: {{"entity_types": ["PERSON", "ORG", "LOC", "DATE"]}}
|
| 98 |
|
| 99 |
+
7. translator (Translate text to another language)
|
| 100 |
+
- start_page: 1, end_page: 1 (always)
|
| 101 |
+
- params: {{"target_lang": "es"|"fr"|"de" etc, "source_lang": "auto"}}
|
| 102 |
|
| 103 |
+
8. signature_verification (Detect and verify signatures)
|
| 104 |
+
- start_page, end_page
|
|
|
|
| 105 |
- params: {{}}
|
| 106 |
|
| 107 |
+
9. stamp_detection (Detect stamps/seals)
|
| 108 |
+
- start_page, end_page
|
|
|
|
| 109 |
- params: {{}}
|
| 110 |
|
| 111 |
+
**CRITICAL RULES:**
|
| 112 |
+
- NEVER use tools not in this list (e.g., NO "extract_entities", "summarize_text", "translate_text")
|
| 113 |
+
- Use "ner" for entity extraction (NOT "extract_entities")
|
| 114 |
+
- Use "summarize" (NOT "summarize_text")
|
| 115 |
+
- Use "translator" (NOT "translate_text")
|
| 116 |
+
- Use "classify" (NOT "classify_text")
|
| 117 |
+
- For text-processing tools (summarize, ner, translator, classify): ALWAYS use start_page=1, end_page=1
|
| 118 |
+
- For extraction tools (extract_text, extract_tables, images, signatures, stamps): use actual page ranges
|
| 119 |
+
|
| 120 |
+
Return ONLY valid JSON:
|
| 121 |
{{
|
| 122 |
"pipeline_name": "descriptive-name",
|
| 123 |
"components": [
|
| 124 |
+
{{"tool_name": "extract_text", "start_page": 1, "end_page": 5, "params": {{"encoding": "utf-8"}}}},
|
| 125 |
+
{{"tool_name": "summarize", "start_page": 1, "end_page": 1, "params": {{"max_length": 500}}}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
],
|
| 127 |
"target_lang": null,
|
| 128 |
+
"reason": "Brief explanation",
|
| 129 |
+
"metadata": {{"estimated_duration_seconds": 30}}
|
|
|
|
|
|
|
| 130 |
}}
|
| 131 |
|
| 132 |
+
Always validate tool_name against the strict list above!"""),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
("human", "User request: {input}\n\nFile: {file_path}")
|
| 134 |
])
|
| 135 |
|
|
|
|
| 186 |
if not GEMINI_API_KEY:
|
| 187 |
raise RuntimeError("Gemini API key not configured")
|
| 188 |
|
| 189 |
+
prompt = f"""You are MasterLLM pipeline generator.
|
| 190 |
+
|
| 191 |
+
STRICT TOOL LIST (USE ONLY THESE):
|
| 192 |
+
- extract_text (pages: start_page, end_page)
|
| 193 |
+
- extract_tables (pages: start_page, end_page)
|
| 194 |
+
- describe_images (pages: start_page, end_page)
|
| 195 |
+
- summarize (always: start_page=1, end_page=1)
|
| 196 |
+
- classify (always: start_page=1, end_page=1)
|
| 197 |
+
- ner (always: start_page=1, end_page=1) - for entity extraction
|
| 198 |
+
- translator (always: start_page=1, end_page=1)
|
| 199 |
+
- signature_verification (pages: start_page, end_page)
|
| 200 |
+
- stamp_detection (pages: start_page, end_page)
|
| 201 |
+
|
| 202 |
+
DO NOT USE: extract_entities, summarize_text, translate_text, classify_text
|
| 203 |
+
USE CORRECT NAMES: ner (not extract_entities), summarize (not summarize_text)
|
| 204 |
|
| 205 |
User request: {user_input}
|
| 206 |
File: {file_path or "user uploaded document"}
|
| 207 |
|
| 208 |
+
Return ONLY valid JSON:
|
| 209 |
{{
|
| 210 |
"pipeline_name": "descriptive-name",
|
| 211 |
"components": [
|
| 212 |
+
{{"tool_name": "extract_text", "start_page": 1, "end_page": 5, "params": {{}}}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
],
|
| 214 |
"target_lang": null,
|
| 215 |
"reason": "explanation",
|
| 216 |
"metadata": {{"estimated_duration_seconds": 30}}
|
| 217 |
+
}}
|
| 218 |
+
|
| 219 |
+
VALIDATE all tool_name values against the strict list!"""
|
| 220 |
|
| 221 |
try:
|
| 222 |
response = requests.post(
|