Spaces:
Sleeping
Sleeping
Commit
·
c1c6de7
1
Parent(s):
4f85312
Implemented EasyOCR-based conversation extraction
Browse files- api/routers/reply_generator.py +1 -1
- requirements.txt +4 -1
- src/__init__.py +0 -0
- src/conversation_extractor/extractor.py +43 -15
- src/conversation_extractor/prompts.py +26 -7
- src/utils/models_loader.py +7 -1
api/routers/reply_generator.py
CHANGED
|
@@ -16,7 +16,7 @@ class UserRequest(BaseModel):
|
|
| 16 |
tones: Optional[List[str]] = None
|
| 17 |
|
| 18 |
@router.post("/reply-generator")
|
| 19 |
-
|
| 20 |
"""
|
| 21 |
Endpoint to extract conversation from a base64-encoded image
|
| 22 |
"""
|
|
|
|
| 16 |
tones: Optional[List[str]] = None
|
| 17 |
|
| 18 |
@router.post("/reply-generator")
|
| 19 |
+
def generate_reply(request: UserRequest):
|
| 20 |
"""
|
| 21 |
Endpoint to extract conversation from a base64-encoded image
|
| 22 |
"""
|
requirements.txt
CHANGED
|
@@ -7,4 +7,7 @@ requests
|
|
| 7 |
fastapi
|
| 8 |
uvicorn
|
| 9 |
python-dotenv
|
| 10 |
-
streamlit
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
fastapi
|
| 8 |
uvicorn
|
| 9 |
python-dotenv
|
| 10 |
+
streamlit
|
| 11 |
+
easyocr
|
| 12 |
+
opencv-python
|
| 13 |
+
pillow
|
src/__init__.py
ADDED
|
File without changes
|
src/conversation_extractor/extractor.py
CHANGED
|
@@ -1,11 +1,15 @@
|
|
| 1 |
|
| 2 |
from google import genai
|
| 3 |
from google.genai import types
|
| 4 |
-
from src.utils.models_loader import ocr_llm
|
| 5 |
from .prompts import prompt
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
| 7 |
from dotenv import load_dotenv
|
| 8 |
import os
|
|
|
|
| 9 |
|
| 10 |
load_dotenv()
|
| 11 |
os.environ['GOOGLE_API_KEY']=os.getenv('GOOGLE_API_KEY')
|
|
@@ -15,18 +19,42 @@ class ConversationExtractor:
|
|
| 15 |
self.client = genai.Client()
|
| 16 |
self.model_name = ocr_llm
|
| 17 |
self.prompt = prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
| 31 |
)
|
| 32 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
from google import genai
|
| 3 |
from google.genai import types
|
| 4 |
+
from src.utils.models_loader import ocr_llm , reply_llm , reader
|
| 5 |
from .prompts import prompt
|
| 6 |
+
import easyocr
|
| 7 |
+
from PIL import Image
|
| 8 |
+
import io
|
| 9 |
+
import numpy as np
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
import os
|
| 12 |
+
from langchain_core.messages import HumanMessage , SystemMessage
|
| 13 |
|
| 14 |
load_dotenv()
|
| 15 |
os.environ['GOOGLE_API_KEY']=os.getenv('GOOGLE_API_KEY')
|
|
|
|
| 19 |
self.client = genai.Client()
|
| 20 |
self.model_name = ocr_llm
|
| 21 |
self.prompt = prompt
|
| 22 |
+
|
| 23 |
+
def complete_ocr(self, image_bytes: bytes, lang_list=None):
    """Run EasyOCR over a chat screenshot and return the detected lines as text.

    Parameters
    ----------
    image_bytes : bytes
        Raw image data (e.g. the decoded base64 payload of a screenshot).
    lang_list : list[str] | None
        Kept for backward compatibility; currently unused because the shared
        module-level ``reader`` is constructed with its own language list.
        Defaults to ``None`` instead of the original mutable ``['en']``
        literal (shared-mutable-default pitfall).

    Returns
    -------
    str
        A "Detected Conversation:" header followed by one "Line i: <text>"
        entry per OCR detection, ordered top-to-bottom.
    """
    # Normalize to RGB so RGBA/palette screenshots don't hand EasyOCR a
    # 4-channel or 2-D palette array (the original comment promised this
    # conversion but never performed it).
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    img_width = img.width

    # EasyOCR accepts a numpy array directly — no temp file needed.
    results = reader.readtext(np.array(img))

    conversation = []
    for bbox, text, conf in results:
        # bbox is four corner points: [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
        x_coords = [p[0] for p in bbox]
        y_coords = [p[1] for p in bbox]
        x, y = int(min(x_coords)), int(min(y_coords))
        w = int(max(x_coords) - min(x_coords))
        h = int(max(y_coords) - min(y_coords))

        # Bubbles whose center sits left of the screen midline are speaker
        # "A"; right of it, speaker "B".
        speaker = "A" if (x + w / 2) < img_width / 2 else "B"

        conversation.append({
            "speaker": speaker,
            "text": text.strip(),
            "box": [x, y, w, h],
        })

    # Restore reading order: sort by the top edge of each detection.
    raw_ocr = sorted(conversation, key=lambda turn: turn["box"][1])
    raw_ocr_text = "Detected Conversation:\n" + "\n".join(
        f"Line {i}: {turn['text']}" for i, turn in enumerate(raw_ocr, start=1)
    )
    return raw_ocr_text
|
| 52 |
+
|
| 53 |
+
def extract_conversation(self, image_bytes: bytes):
    """OCR the screenshot, then have the LLM strip UI noise from the raw text.

    Parameters
    ----------
    image_bytes : bytes
        Raw image data of the chat screenshot.

    Returns
    -------
    str
        The LLM-cleaned conversation, one "Line i: ..." message per line
        (format dictated by the system prompt).
    """
    raw_ocr_text = self.complete_ocr(image_bytes)
    # Use the prompt stored on the instance (set in __init__) instead of the
    # module-level `prompt` the original reached for — same value today, but
    # the instance attribute is the class's declared configuration point.
    messages = [
        SystemMessage(content=self.prompt),
        HumanMessage(content=raw_ocr_text),
    ]
    response = reply_llm.invoke(messages)
    print('The cleaned ocr:', response.content)  # debug trace; consider logging instead
    return response.content
|
| 60 |
+
|
src/conversation_extractor/prompts.py
CHANGED
|
@@ -1,8 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# System prompt for the OCR clean-up LLM: strips timestamps, status-bar noise
# and other non-message artifacts from raw EasyOCR output, preserving order.
# (A stray orphan `"` line after rule 3 in the original leaked into every
# LLM call; removed.)
prompt = '''
You are a text cleaner and conversation extractor.
You will be given raw OCR text from a screenshot of a chat. The text may contain:

- Timestamps (e.g., "05:32 PM", "05.33 PM")
- System messages of mobile phone (e.g., "New Contact", "New Message")
- Battery percentage, signal info, or other UI elements
- Numbers or non-message text

Your task is to extract **only the actual conversation messages exchanged between the two people**, in the **order they appear**.

**Rules:**
1. Remove all system messages, timestamps, numbers, and noise.
2. Keep the text messages only.
3. Keep the order of messages intact.

**Output Format:**
A clean list of messages between the two people, one message per line. Format like this:

Line 1: Hello!
Line 2: Hi, how are you?
Line 3: I'm good, thanks! And you?
...

'''
|
src/utils/models_loader.py
CHANGED
|
@@ -1,8 +1,14 @@
|
|
| 1 |
from langchain_groq import ChatGroq
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
import os
|
|
|
|
| 4 |
|
| 5 |
load_dotenv()
|
| 6 |
os.environ['GROQ_API_KEY']=os.getenv('GROQ_API_KEY')
|
| 7 |
reply_llm = ChatGroq(model='llama-3.1-8b-instant')
|
| 8 |
-
ocr_llm = "gemini-2.5-flash"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""Shared model singletons: Groq chat LLM, Gemini OCR model name, EasyOCR reader."""
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os
import easyocr

load_dotenv()

# Propagate the key only when it is actually present: the original
# unconditional `os.environ['GROQ_API_KEY'] = os.getenv('GROQ_API_KEY')`
# raised TypeError (environ values must be str) whenever the variable was
# missing from both the environment and the .env file.
_groq_key = os.getenv('GROQ_API_KEY')
if _groq_key is not None:
    os.environ['GROQ_API_KEY'] = _groq_key

reply_llm = ChatGroq(model='llama-3.1-8b-instant')
ocr_llm = "gemini-2.5-flash"

lang_list = ['en']
# gpu=True is a request, not a hard requirement — EasyOCR falls back to CPU
# (with a warning) when CUDA is unavailable.
reader = easyocr.Reader(lang_list, gpu=True)
|