subashpoudel commited on
Commit
c1c6de7
·
1 Parent(s): 4f85312

Implemented EasyOCR-based conversation extraction

Browse files
api/routers/reply_generator.py CHANGED
@@ -16,7 +16,7 @@ class UserRequest(BaseModel):
16
  tones: Optional[List[str]] = None
17
 
18
  @router.post("/reply-generator")
19
- async def generate_reply(request: UserRequest):
20
  """
21
  Endpoint to extract conversation from a base64-encoded image
22
  """
 
16
  tones: Optional[List[str]] = None
17
 
18
  @router.post("/reply-generator")
19
+ def generate_reply(request: UserRequest):
20
  """
21
  Endpoint to extract conversation from a base64-encoded image
22
  """
requirements.txt CHANGED
@@ -7,4 +7,7 @@ requests
7
  fastapi
8
  uvicorn
9
  python-dotenv
10
- streamlit
 
 
 
 
7
  fastapi
8
  uvicorn
9
  python-dotenv
10
+ streamlit
11
+ easyocr
12
+ opencv-python
13
+ pillow
src/__init__.py ADDED
File without changes
src/conversation_extractor/extractor.py CHANGED
@@ -1,11 +1,15 @@
1
 
2
  from google import genai
3
  from google.genai import types
4
- from src.utils.models_loader import ocr_llm
5
  from .prompts import prompt
6
-
 
 
 
7
  from dotenv import load_dotenv
8
  import os
 
9
 
10
  load_dotenv()
11
  os.environ['GOOGLE_API_KEY']=os.getenv('GOOGLE_API_KEY')
@@ -15,18 +19,42 @@ class ConversationExtractor:
15
  self.client = genai.Client()
16
  self.model_name = ocr_llm
17
  self.prompt = prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- def extract_conversation(self, image_bytes: bytes) -> str:
20
- """
21
- Extract conversation text from an image.
22
- :param image_bytes: Binary content of the image
23
- :return: Extracted conversation text
24
- """
25
- response = self.client.models.generate_content(
26
- model=self.model_name,
27
- contents=[
28
- types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
29
- self.prompt
30
- ]
 
31
  )
32
- return response.text
 
 
 
 
 
 
 
 
 
 
1
 
2
  from google import genai
3
  from google.genai import types
4
+ from src.utils.models_loader import ocr_llm , reply_llm , reader
5
  from .prompts import prompt
6
+ import easyocr
7
+ from PIL import Image
8
+ import io
9
+ import numpy as np
10
  from dotenv import load_dotenv
11
  import os
12
+ from langchain_core.messages import HumanMessage , SystemMessage
13
 
14
  load_dotenv()
15
  os.environ['GOOGLE_API_KEY']=os.getenv('GOOGLE_API_KEY')
 
19
  self.client = genai.Client()
20
  self.model_name = ocr_llm
21
  self.prompt = prompt
22
+
23
+ def complete_ocr(self, image_bytes:bytes , lang_list=['en']):
24
+ img = Image.open(io.BytesIO(image_bytes))
25
+ w_img = img.width
26
+
27
+ # Convert PIL image to RGB and read it directly using EasyOCR
28
+ results = reader.readtext(np.array(img)) # Use numpy array instead of file path
29
+
30
+ conversation = []
31
+ for bbox, text, conf in results:
32
+ # bbox: [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
33
+ x_coords = [p[0] for p in bbox]
34
+ y_coords = [p[1] for p in bbox]
35
+ x, y, w, h = int(min(x_coords)), int(min(y_coords)), int(max(x_coords)-min(x_coords)), int(max(y_coords)-min(y_coords))
36
 
37
+ # Determine left or right speaker
38
+ speaker = "A" if (x + w/2) < w_img/2 else "B"
39
+
40
+ conversation.append({
41
+ "speaker": speaker,
42
+ "text": text.strip(),
43
+ "box": [x, y, w, h]
44
+ })
45
+
46
+ # Sort top to bottom
47
+ raw_ocr = sorted(conversation, key=lambda x: x["box"][1])
48
+ raw_ocr_text = "Detected Conversation:\n" + "\n".join(
49
+ [f"Line {i}: {turn['text']}" for i, turn in enumerate(raw_ocr, start=1)]
50
  )
51
+ return raw_ocr_text
52
+
53
+ def extract_conversation(self, image_bytes:bytes):
54
+ raw_ocr_text=self.complete_ocr(image_bytes)
55
+ messages = [SystemMessage(content = prompt),
56
+ HumanMessage(content = raw_ocr_text)]
57
+ response =reply_llm.invoke(messages)
58
+ print('The cleaned ocr:', response.content)
59
+ return response.content
60
+
src/conversation_extractor/prompts.py CHANGED
@@ -1,8 +1,27 @@
 
 
 
1
 
2
- prompt = (
3
- "Extract only the conversation text between two persons from this image. "
4
- "Ignore timestamps, emojis, or extra elements. "
5
- "Preserve dialogue order. "
6
- "If names exist, use them; otherwise label speakers Person A and Person B. "
7
- "Output strictly as chat dialogue lines. Return only dialogue."
8
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prompt for the OCR-cleaning step: the LLM receives raw EasyOCR output and
# must return only the chat messages, one "Line i: ..." per line.
# Fix: removed a stray lone '"' line that was left inside the prompt text.
prompt = '''
You are a text cleaner and conversation extractor.
You will be given raw OCR text from a screenshot of a chat. The text may contain:

- Timestamps (e.g., "05:32 PM", "05.33 PM")
- System messages of mobile phone (e.g., "New Contact", "New Message")
- Battery percentage, signal info, or other UI elements
- Numbers or non-message text

Your task is to extract **only the actual conversation messages exchanged between the two people**, in the **order they appear**.

**Rules:**
1. Remove all system messages, timestamps, numbers, and noise.
2. Keep the text messages only.
3. Keep the order of messages intact.

**Output Format:**
A clean list of messages between the two people, one message per line. Format like this:

Line 1: Hello!
Line 2: Hi, how are you?
Line 3: I'm good, thanks! And you?
...

'''
src/utils/models_loader.py CHANGED
@@ -1,8 +1,14 @@
1
  from langchain_groq import ChatGroq
2
  from dotenv import load_dotenv
3
  import os
 
4
 
5
  load_dotenv()
6
  os.environ['GROQ_API_KEY']=os.getenv('GROQ_API_KEY')
7
  reply_llm = ChatGroq(model='llama-3.1-8b-instant')
8
- ocr_llm = "gemini-2.5-flash"
 
 
 
 
 
 
1
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os
import easyocr

load_dotenv()

# os.environ values must be str: assigning os.getenv(...) directly raises
# TypeError when GROQ_API_KEY is unset, so only copy it when present.
groq_key = os.getenv('GROQ_API_KEY')
if groq_key is not None:
    os.environ['GROQ_API_KEY'] = groq_key

# Chat model used to clean raw OCR text and generate replies.
reply_llm = ChatGroq(model='llama-3.1-8b-instant')
# Gemini model name passed to the genai client for image OCR.
ocr_llm = "gemini-2.5-flash"

# Shared EasyOCR reader, loaded once at import time (downloads model weights
# on first run). gpu=True falls back to CPU with a warning when no GPU is
# available — TODO confirm that is acceptable for the deployment target.
lang_list = ['en']
reader = easyocr.Reader(lang_list, gpu=True)