Ilia Tambovtsev committed on
Commit
5627b7a
·
1 Parent(s): bf51a26

feat: implement VisionAnalysisChain

Browse files
Files changed (1) hide show
  1. src/pdf_utils/chains.py +63 -3
src/pdf_utils/chains.py CHANGED
@@ -9,9 +9,9 @@ from langchain.chat_models import ChatOpenAI
9
  from langchain.prompts import ChatPromptTemplate
10
  from langchain.schema.messages import HumanMessage
11
  from langchain.callbacks.manager import CallbackManagerForChainRun
12
- # Import batch processing utilities
13
- # from langchain.chains.batch import BatchedChain
14
- # from langchain.chains.transform import TransformChainMixin
15
  import pdf2image
16
 
17
  from config.navigator import Navigator
@@ -102,3 +102,63 @@ class ImageLoaderChain(Chain):
102
  return {"image": image_base64}
103
 
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  from langchain.prompts import ChatPromptTemplate
10
  from langchain.schema.messages import HumanMessage
11
  from langchain.callbacks.manager import CallbackManagerForChainRun
12
+ from langchain_core.output_parsers import StrOutputParser
13
+ from langchain.pydantic_v1 import Extra
14
+
15
  import pdf2image
16
 
17
  from config.navigator import Navigator
 
102
  return {"image": image_base64}
103
 
104
 
105
class VisionAnalysisChain(Chain):
    """Analyze a single base64-encoded slide image with a vision-capable LLM.

    Takes one input key, ``image`` (a base64-encoded PNG), runs it through a
    fixed text prompt + image message, and returns the model's textual
    description under the ``analysis`` key.
    """

    @property
    def input_keys(self) -> List[str]:
        """Required input keys for the chain."""
        return ["image"]

    @property
    def output_keys(self) -> List[str]:
        """Output keys provided by the chain."""
        return ["analysis"]

    def __init__(
        self,
        llm: ChatOpenAI,
        prompt: str = "Describe this slide in detail",
        **kwargs,
    ):
        """Initialize the chain with vision capabilities.

        Args:
            llm: Language model with vision capabilities (e.g. GPT-4V).
            prompt: Custom prompt for slide analysis.
        """
        super().__init__(**kwargs)

        # Stored as underscore-prefixed instance attributes so we don't have
        # to declare pydantic fields on the Chain subclass.
        # NOTE(review): whether plain assignment of non-field attributes is
        # accepted depends on the base Chain's pydantic Config; `Extra` is
        # imported at module level but unused — a `class Config:
        # extra = Extra.allow` may have been intended. Confirm at runtime.
        self._llm = llm
        self._prompt = prompt

        # Human message with two content parts: the text prompt and the
        # inline data-URL image. Both "{prompt}" and "{image}" are template
        # variables filled in at invoke time.
        # NOTE(review): OpenAI's vision API expects content parts of type
        # "image_url" with a nested {"url": ...}; confirm LangChain
        # translates this {"type": "image", ...} part correctly for the
        # installed langchain version.
        self._vision_prompt_template = ChatPromptTemplate.from_messages([
            ("human", [
                {"type": "text", "text": "{prompt}"},
                {
                    "type": "image",
                    "image_url": "data:image/png;base64,{image}",
                },
            ])
        ])

        # LCEL pipeline: prompt -> vision LLM -> plain-text parser. The
        # trailing dict literal is coerced by LCEL into a runnable mapping,
        # which keys the parsed output under "analysis".
        self._chain = (
            self._vision_prompt_template
            | self._llm
            | {"analysis": StrOutputParser()}
        )

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Process a single image with the vision model.

        Args:
            inputs: dict(image=<base64-encoded image>)

        Returns:
            dict(analysis=<textual analysis of the image by the model>)
        """
        return self._chain.invoke({
            "prompt": self._prompt,
            "image": inputs["image"],
        })