krishnachoudhary-hclguvi committed on
Commit
a2aa7c3
·
unverified ·
1 Parent(s): 483f7ec

Sync GitHub commit b749f19 updates

Browse files
.gitignore ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Build and virtual environments
2
+ .venv/
3
+ venv/
4
+ ENV/
5
+ __pycache__/
6
+ *.pyc
7
+ *.pyo
8
+ *.pyd
9
+ .Python
10
+ env/
11
+ pip-log.txt
12
+ pip-delete-this-directory.txt
13
+
14
+ # Local configuration and databases
15
+ .env
16
+ docintel.db
17
+ docintel.db-journal
18
+ uploads/*
19
+ !uploads/.gitkeep
20
+
21
+ # Sensitive or large test files
22
+ test_results.txt
23
+ test_output.txt
24
+ test_api_results.json
25
+ *.log
26
+
27
+ # IDE and System files
28
+ .idea/
29
+ .vscode/
30
+ .vscode-test/
31
+ .DS_Store
32
+ Thumbs.db
33
+
34
+ # Tool-specific
35
+ .gemini/
36
+ scripts/
37
+ brain/
38
+ implementation_plan.md
39
+ task.md
40
+ walkthrough.md
analyzers/summarizer.py CHANGED
@@ -11,10 +11,66 @@ from sumy.nlp.stemmers import Stemmer
11
  from sumy.utils import get_stop_words
12
  from models.schemas import SummaryResult
13
  from config import SUMMARY_SENTENCE_COUNT, SUMMARY_ALGORITHM
 
 
 
 
 
 
 
 
14
 
15
  LANGUAGE = "english"
16
 
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def _get_summarizer(algorithm: str):
19
  """Get the appropriate summarizer based on algorithm name."""
20
  stemmer = Stemmer(LANGUAGE)
@@ -32,21 +88,20 @@ def _get_summarizer(algorithm: str):
32
 
33
  def summarize_text(text: str, sentence_count: int = None, algorithm: str = None) -> SummaryResult:
34
  """
35
- Generate an extractive summary of the given text.
36
-
37
- Args:
38
- text: The input text to summarize.
39
- sentence_count: Number of sentences in the summary (default from config).
40
- algorithm: Summarization algorithm to use (default from config).
41
-
42
- Returns:
43
- SummaryResult with the summary and statistics.
44
  """
45
  if sentence_count is None:
46
  sentence_count = SUMMARY_SENTENCE_COUNT
47
  if algorithm is None:
48
  algorithm = SUMMARY_ALGORITHM
49
 
 
 
 
 
 
 
50
  # Handle short texts
51
  sentences_in_text = [s.strip() for s in text.replace("\n", " ").split(".") if s.strip()]
52
  if len(sentences_in_text) <= sentence_count:
 
11
  from sumy.utils import get_stop_words
12
  from models.schemas import SummaryResult
13
  from config import SUMMARY_SENTENCE_COUNT, SUMMARY_ALGORITHM
14
+ import config
15
+ import time
16
+
17
+ try:
18
+ import google.generativeai as genai
19
+ GEMINI_AVAILABLE = True
20
+ except ImportError:
21
+ GEMINI_AVAILABLE = False
22
 
23
  LANGUAGE = "english"
24
 
25
 
26
def summarize_with_gemini(text: str) -> "SummaryResult | None":
    """Generate an abstractive summary and key highlights using Gemini.

    Args:
        text: The document text to summarize.

    Returns:
        A SummaryResult when Gemini returned a non-empty summary. None when the
        Gemini SDK is not installed, no API key is configured, the input is
        blank, the response is unusable, or the request fails. A None result
        lets the caller fall back to another summarizer.
    """
    import json

    # `genai` only exists when the optional import succeeded, so check the
    # import flag before touching it.
    if not GEMINI_AVAILABLE or not config.is_gemini_available():
        return None
    # Nothing to summarize: skip the network round trip entirely.
    if not text or not text.strip():
        return None

    try:
        genai.configure(api_key=config.GEMINI_API_KEY)
        model = genai.GenerativeModel(config.GEMINI_MODEL_NAME)

        prompt = (
            "You are an expert document analyst. Read the following text and create a highly synthesized, unique abstractive summary.\n"
            "CRITICAL INSTRUCTIONS:\n"
            "1. Do NOT just copy/paste or extract sentences verbatim from the text. Synthesize the meaning into your own words.\n"
            "2. Provide a unique, high-level overview of the entire document's core message or purpose.\n"
            "3. Structure the summary with thematic topics (e.g., **Key Themes**, **Major Findings**, **Core Assertions**, or document-specific domains like **Experience** for resumes).\n"
            "4. For each topic, provide concise insights, not just a list of extracted facts.\n"
            "5. Synthesize 3 to 7 truly unique, critical 'key points' that represent the ultimate takeaways of the document for the key_points array.\n"
            "Respond strictly in JSON format:\n"
            '{"summary": "**Topic 1**\\n- Insightful summary point 1...\\n\\n**Topic 2**\\n- Insightful summary point 2...", "key_points": ["**CORE TAKEAWAY**: synthesized point", ...]}'
        )

        response = model.generate_content(
            f"{prompt}\n\nText: {text}",
            generation_config={"response_mime_type": "application/json"},
        )
        data = json.loads(response.text)
        # The model is asked for a JSON object, but nothing enforces it.
        if not isinstance(data, dict):
            return None

        summary = data.get("summary") or ""
        if not isinstance(summary, str):
            summary = str(summary)
        if not summary.strip():
            return None

        # Normalize key_points to a list of non-empty strings so a slightly
        # malformed response does not discard an otherwise valid summary.
        raw_points = data.get("key_points") or []
        if isinstance(raw_points, str):
            raw_points = [raw_points]
        elif not isinstance(raw_points, (list, tuple)):
            raw_points = []
        key_points = [str(point).strip() for point in raw_points if str(point).strip()]

        return SummaryResult(
            summary=summary,
            key_points=key_points,
            original_length=len(text),
            summary_length=len(summary),
            compression_ratio=round(len(summary) / len(text), 4),
            sentence_count=len(key_points),  # Using key_points count as surrogate
            algorithm="Gemini AI (Abstractive)",
        )
    except Exception as e:
        # Best-effort integration: any SDK, network, or parsing failure means
        # "no Gemini summary" rather than a failed request.
        print(f"Gemini summarization failed: {e}")

    return None
72
+
73
+
74
  def _get_summarizer(algorithm: str):
75
  """Get the appropriate summarizer based on algorithm name."""
76
  stemmer = Stemmer(LANGUAGE)
 
88
 
89
  def summarize_text(text: str, sentence_count: int = None, algorithm: str = None) -> SummaryResult:
90
  """
91
+ Generate an extractive or abstractive summary of the given text.
92
+ Prioritizes Gemini if available.
 
 
 
 
 
 
 
93
  """
94
  if sentence_count is None:
95
  sentence_count = SUMMARY_SENTENCE_COUNT
96
  if algorithm is None:
97
  algorithm = SUMMARY_ALGORITHM
98
 
99
+ # 0. Try Gemini (Superior abstractive quality)
100
+ if GEMINI_AVAILABLE and config.is_gemini_available():
101
+ gemini_result = summarize_with_gemini(text)
102
+ if gemini_result:
103
+ return gemini_result
104
+
105
  # Handle short texts
106
  sentences_in_text = [s.strip() for s in text.replace("\n", " ").split(".") if s.strip()]
107
  if len(sentences_in_text) <= sentence_count:
analyzers/text_cleaner.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Intelligent text cleaner using Gemini to format raw OCR and PDF extractions perfectly.
3
+ """
4
+ import time
5
+ import config
6
+
7
+ try:
8
+ import google.generativeai as genai
9
+ GEMINI_AVAILABLE = True
10
+ except ImportError:
11
+ GEMINI_AVAILABLE = False
12
+
13
+
14
def clean_format_text(raw_text: str) -> str:
    """Ask Gemini to tidy raw extracted text into structured markdown.

    The input is returned unchanged when Gemini is not configured or not
    installed, when the stripped text is shorter than 50 characters, when the
    model returns nothing, or when the request raises.
    """
    if not (config.is_gemini_available() and GEMINI_AVAILABLE):
        return raw_text

    # Skip if text is extremely short
    if len(raw_text.strip()) < 50:
        return raw_text

    instructions = "".join((
        "You are a master document formatting assistant. Your task is to clean up and perfectly format the raw extracted text below into a structured and topic-wise format.\n\n",
        "CRITICAL INSTRUCTIONS:\n",
        "1. You MUST preserve EVERY SINGLE WORD and detail from the original text. Do not summarize, skip, or rephrase anything. No information loss is acceptable.\n",
        "2. Organize all content logically into structured, thematic topics (topic-wise). Apply bold markdown headers (e.g. **Contact Information**, **Experience**, **Summary**, or other relevant topics) and use proper bullet points.\n",
        "3. Fix arbitrary broken line-breaks (typical OCR artifacts) and stitch sentences back together naturally.\n",
        "4. Return ONLY the perfectly formatted text. Do not include any JSON wrapping or conversational preamble.\n\n",
        "RAW TEXT:\n",
    ))

    try:
        genai.configure(api_key=config.GEMINI_API_KEY)
        formatter = genai.GenerativeModel(config.GEMINI_MODEL_NAME)

        # Plain-text response on purpose: no JSON mime type is requested here.
        reply = formatter.generate_content(instructions + raw_text)

        cleaned = reply.text.strip() if reply.text else ""
        if cleaned:
            return cleaned
    except Exception as e:
        print(f"Intelligent formatting failed, falling back to raw: {e}")

    return raw_text
config.py CHANGED
@@ -1,8 +1,9 @@
1
- """
2
- Configuration settings for the Document Processing System.
3
- """
4
  import os
5
  import shutil
 
 
 
 
6
 
7
  # --- Paths ---
8
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -75,3 +76,11 @@ SENTIMENT_THRESHOLDS = {
75
  "positive": 0.05,
76
  "negative": -0.05,
77
  }
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import shutil
3
+ from dotenv import load_dotenv
4
+
5
+ # Load environment variables from .env file
6
+ load_dotenv()
7
 
8
  # --- Paths ---
9
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
76
  "positive": 0.05,
77
  "negative": -0.05,
78
  }
79
+
80
# --- Gemini AI Configuration ---
# API key and model name are read from the environment once, at import time.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
GEMINI_MODEL_NAME = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash")


def is_gemini_available() -> bool:
    """Return True when a non-empty Gemini API key was found in the environment."""
    return True if GEMINI_API_KEY else False
extractors/ocr_extractor.py CHANGED
@@ -9,6 +9,12 @@ from PIL import Image, ImageEnhance, ImageFilter, ImageOps
9
  from models.schemas import ExtractionResult, DocumentMetadata
10
  import config
11
 
 
 
 
 
 
 
12
  # --- OCR Engine Detection ---
13
 
14
  try:
@@ -81,40 +87,6 @@ def _preprocess_image(image: Image.Image) -> Image.Image:
81
  return image
82
 
83
 
84
- def _preprocess_color_text(image: Image.Image) -> Image.Image:
85
- """Preprocess image to preserve colored headline text (e.g., certificates)."""
86
- rgb = image.convert("RGB")
87
- rgb = ImageEnhance.Color(rgb).enhance(2.2)
88
- rgb = ImageEnhance.Contrast(rgb).enhance(1.25)
89
- rgb = rgb.filter(ImageFilter.SHARPEN)
90
- return rgb
91
-
92
-
93
- def _filter_easyocr_results(results: list, min_conf: float = 0.25) -> list:
94
- """Drop very low-confidence and non-informative EasyOCR boxes."""
95
- filtered = []
96
- for item in results or []:
97
- if len(item) < 3:
98
- continue
99
- text = str(item[1]).strip()
100
- conf = float(item[2])
101
- if conf < min_conf:
102
- continue
103
- if not any(ch.isalnum() for ch in text):
104
- continue
105
- filtered.append(item)
106
- return filtered
107
-
108
-
109
- def _score_extracted_text(text: str) -> int:
110
- """Heuristic score to choose best OCR pass output."""
111
- if not text:
112
- return 0
113
- alpha_num = sum(1 for c in text if c.isalnum())
114
- penalties = sum(1 for c in text if c in "{}[]|~`")
115
- return alpha_num - (penalties * 3)
116
-
117
-
118
  def _reconstruct_from_boxes(results: list) -> str:
119
  """ Reconstruct text layout from bounding boxes.
120
  Sort by top, then group by 'lines' based on y-coordinate.
@@ -167,59 +139,88 @@ def _reconstruct_from_boxes(results: list) -> str:
167
  return "\n".join(final_text)
168
 
169
 
170
- def extract_image(file_path: str) -> ExtractionResult:
171
- """Extract text from an image using the best available OCR engine."""
 
 
 
172
  start_time = time.time()
173
- original_size = (0, 0)
174
  try:
175
- with Image.open(file_path) as img:
176
- original_size = img.size
177
- except Exception:
178
- # Keep defaults; OCR engines will surface the real file/open errors.
179
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
- # 1. Check for EasyOCR (Preferred)
 
 
 
 
 
 
182
  if EASYOCR_AVAILABLE:
183
  try:
184
  reader = get_easyocr_reader()
185
  if reader:
186
- with Image.open(file_path) as src_img:
187
- base_img = src_img.convert("RGB")
188
-
189
- # Pass 1: standard detection with lower thresholds for certificate layouts.
190
- results_default = reader.readtext(
191
- np.array(base_img),
192
- detail=1,
193
- paragraph=False,
194
- canvas_size=1200,
195
- contrast_ths=0.1,
196
- mag_ratio=1.2,
197
- text_threshold=0.6,
198
- low_text=0.25,
199
- link_threshold=0.25,
200
- )
201
-
202
- # Pass 2: boosted color/contrast to recover orange/blue headings.
203
- color_img = _preprocess_color_text(base_img)
204
- results_color = reader.readtext(
205
- np.array(color_img),
206
- detail=1,
207
- paragraph=False,
208
- canvas_size=1200,
209
- contrast_ths=0.05,
210
- mag_ratio=1.2,
211
- text_threshold=0.55,
212
- low_text=0.2,
213
- link_threshold=0.2,
214
  )
215
-
216
- filtered_default = _filter_easyocr_results(results_default)
217
- filtered_color = _filter_easyocr_results(results_color)
218
-
219
- text_default = _reconstruct_from_boxes(filtered_default)
220
- text_color = _reconstruct_from_boxes(filtered_color)
221
-
222
- text = text_default if _score_extracted_text(text_default) >= _score_extracted_text(text_color) else text_color
223
 
224
  if text.strip():
225
  elapsed = (time.time() - start_time) * 1000
 
9
  from models.schemas import ExtractionResult, DocumentMetadata
10
  import config
11
 
12
+ try:
13
+ import google.generativeai as genai
14
+ GEMINI_AVAILABLE = True
15
+ except ImportError:
16
+ GEMINI_AVAILABLE = False
17
+
18
  # --- OCR Engine Detection ---
19
 
20
  try:
 
87
  return image
88
 
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def _reconstruct_from_boxes(results: list) -> str:
91
  """ Reconstruct text layout from bounding boxes.
92
  Sort by top, then group by 'lines' based on y-coordinate.
 
139
  return "\n".join(final_text)
140
 
141
 
142
def extract_image_gemini(file_path: str) -> ExtractionResult:
    """Extract text from an image with the configured Gemini model.

    Args:
        file_path: Path of the image file to read.

    Returns:
        An ExtractionResult with success=True and the recognized text, or
        success=False with an error_message when the SDK or API key is
        missing, the model returns no text, or the request fails.
    """
    def _failure(message: str) -> ExtractionResult:
        # Single place that builds the "no result" value.
        return ExtractionResult(
            success=False,
            error_message=message,
            raw_text="",
            metadata=DocumentMetadata(),
        )

    # `genai` only exists when the optional import succeeded.
    if not GEMINI_AVAILABLE:
        return _failure("Gemini SDK is not installed")
    if not config.GEMINI_API_KEY:
        return _failure("Gemini API Key missing")

    start_time = time.time()
    try:
        genai.configure(api_key=config.GEMINI_API_KEY)
        model = genai.GenerativeModel(config.GEMINI_MODEL_NAME)

        # Prompt for perfect extraction with layout preservation
        prompt = (
            "Perform OCR on this image. Extract EVERY bit of text correctly. "
            "Maintain the original layout, columns, and spacing exactly as they appear. "
            "Do not add any explanations, markdown, or commentary. Output only the extracted text."
        )

        # The context manager closes the file handle even if the request
        # raises; the dimensions are captured while the image is still open.
        with Image.open(file_path) as image:
            width, height = image.size
            response = model.generate_content([prompt, image])

        text = (response.text or "").strip()
        if not text:
            return _failure("Gemini returned no text")

        elapsed = (time.time() - start_time) * 1000
        metadata = DocumentMetadata(
            title=os.path.basename(file_path),
            page_count=1,
            word_count=len(text.split()),
            character_count=len(text),
            file_type="Image (Gemini AI)",
            extra={
                "image_width": width,
                "image_height": height,
                # Report the model actually used; it is configurable.
                "ocr_engine": f"Gemini ({config.GEMINI_MODEL_NAME})",
                "accuracy": "Perfect (Vision-Language Model)",
            },
        )
        return ExtractionResult(
            raw_text=text,
            metadata=metadata,
            success=True,
            extraction_time_ms=elapsed,
        )
    except Exception as e:
        # Best-effort engine: report the failure so the caller can try the
        # next OCR engine, and keep the reason in the result.
        print(f"Gemini OCR failed: {e}")
        return _failure(f"Gemini failed: {e}")
189
+
190
+
191
+ def extract_image(file_path: str) -> ExtractionResult:
192
+ """Extract text from an image using the best available OCR engine (Gemini -> EasyOCR -> Tesseract)."""
193
+ start_time = time.time()
194
 
195
+ # 0. Check for Gemini (Best quality, layout aware)
196
+ if GEMINI_AVAILABLE and config.is_gemini_available():
197
+ result = extract_image_gemini(file_path)
198
+ if result.success:
199
+ return result
200
+
201
+ # 1. Check for EasyOCR (Preferred local)
202
  if EASYOCR_AVAILABLE:
203
  try:
204
  reader = get_easyocr_reader()
205
  if reader:
206
+ # Get original dimensions for metadata
207
+ with Image.open(file_path) as img:
208
+ original_size = img.size
209
+
210
+ # EasyOCR works well with both original and preprocessed images
211
+ # We'll use a slightly preprocessed version for consistency
212
+ # Perform OCR with layout awareness
213
+ # Adjusting thresholds for better numeric and tabular capture
214
+ results = reader.readtext(
215
+ file_path,
216
+ detail=1,
217
+ paragraph=False, # We want individual boxes for layout reconstruction
218
+ canvas_size=1200, # Shrunk to detect huge fonts (like certificate names) that CRAFT misses
219
+ contrast_ths=0.1 # Reset to 0.1 so colored/light text isn't dropped
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  )
221
+
222
+ # Reconstruct full layout from bounding boxes
223
+ text = _reconstruct_from_boxes(results)
 
 
 
 
 
224
 
225
  if text.strip():
226
  elapsed = (time.time() - start_time) * 1000
extractors/url_extractor.py CHANGED
@@ -17,7 +17,9 @@ def extract_url(url: str) -> ExtractionResult:
17
  headers = {
18
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
19
  }
20
- response = requests.get(url, headers=headers, timeout=10)
 
 
21
  response.raise_for_status()
22
 
23
  # 2. Parse HTML
 
17
  headers = {
18
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
19
  }
20
+ import urllib3
21
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
22
+ response = requests.get(url, headers=headers, timeout=10, verify=False)
23
  response.raise_for_status()
24
 
25
  # 2. Parse HTML
main.py CHANGED
@@ -24,6 +24,7 @@ from extractors.url_extractor import extract_url
24
  from analyzers.summarizer import summarize_text
25
  from analyzers.ner_extractor import extract_entities
26
  from analyzers.sentiment import analyze_sentiment
 
27
 
28
  # --- App Setup ---
29
  app = FastAPI(
@@ -96,7 +97,18 @@ def _process_document(file_path: str, file_type: str, task_id: str):
96
  task.error_message = extraction.error_message or "No text could be extracted."
97
  task.processing_time_ms = (time.time() - start_time) * 1000
98
  return
 
99
 
 
 
 
 
 
 
 
 
 
 
100
  raw_text = extraction.raw_text
101
 
102
  # Step 2: Summarization
 
24
  from analyzers.summarizer import summarize_text
25
  from analyzers.ner_extractor import extract_entities
26
  from analyzers.sentiment import analyze_sentiment
27
+ from analyzers.text_cleaner import clean_format_text
28
 
29
  # --- App Setup ---
30
  app = FastAPI(
 
97
  task.error_message = extraction.error_message or "No text could be extracted."
98
  task.processing_time_ms = (time.time() - start_time) * 1000
99
  return
100
+ raw_text = extraction.raw_text
101
 
102
+ # Intelligent Formatting Pass via Gemini
103
+ formatted_text = clean_format_text(raw_text)
104
+
105
+ if formatted_text == raw_text:
106
+ # Fallback cleanup for broken line breaks if Gemini was unavailable
107
+ import re
108
+ formatted_text = re.sub(r'(?<!\n)\n(?!\n)', ' ', formatted_text)
109
+ formatted_text = re.sub(r'[ \t]+', ' ', formatted_text)
110
+
111
+ extraction.raw_text = formatted_text.strip()
112
  raw_text = extraction.raw_text
113
 
114
  # Step 2: Summarization
models/schemas.py CHANGED
@@ -52,6 +52,7 @@ class ExtractionResult(BaseModel):
52
 
53
  class SummaryResult(BaseModel):
54
  summary: str
 
55
  original_length: int
56
  summary_length: int
57
  compression_ratio: float
 
52
 
53
  class SummaryResult(BaseModel):
54
  summary: str
55
+ key_points: List[str] = []
56
  original_length: int
57
  summary_length: int
58
  compression_ratio: float
requirements.txt CHANGED
@@ -14,3 +14,5 @@ nltk==3.8.1
14
  aiofiles==24.1.0
15
  requests==2.32.3
16
  beautifulsoup4==4.12.3
 
 
 
14
  aiofiles==24.1.0
15
  requests==2.32.3
16
  beautifulsoup4==4.12.3
17
+ google-generativeai
18
+ python-dotenv
static/app.js CHANGED
@@ -261,26 +261,56 @@ function displayResults(data) {
261
  const timeSeconds = (data.processing_time_ms / 1000).toFixed(1);
262
  $('#processingTime').textContent = `⏱ ${timeSeconds}s`;
263
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  // Extracted Text
265
  const textEl = $('#extractedText');
266
  if (data.extraction?.raw_text) {
267
- textEl.textContent = data.extraction.raw_text;
268
  } else {
269
  textEl.innerHTML = `<p class="placeholder">${data.extraction?.error_message || 'No text extracted.'}</p>`;
270
  }
271
 
272
  // Summary
273
  if (data.summary) {
274
- $('#summaryContent').textContent = data.summary.summary;
 
275
  $('#summaryStats').classList.remove('hidden');
276
  $('#statOriginalLen').textContent = data.summary.original_length.toLocaleString();
277
  $('#statSummaryLen').textContent = data.summary.summary_length.toLocaleString();
278
  const pct = Math.round((1 - data.summary.compression_ratio) * 100);
279
  $('#statCompression').textContent = `${pct}%`;
280
  $('#statAlgorithm').textContent = data.summary.algorithm;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  } else {
282
  $('#summaryContent').innerHTML = '<p class="placeholder">Summarization not available.</p>';
283
  $('#summaryStats').classList.add('hidden');
 
284
  }
285
 
286
  // Entities
@@ -560,6 +590,8 @@ function resetAll() {
560
  $('#extractedText').innerHTML = '<p class="placeholder">No text extracted yet.</p>';
561
  $('#summaryContent').innerHTML = '<p class="placeholder">No summary available.</p>';
562
  $('#summaryStats').classList.add('hidden');
 
 
563
  $('#entityCategories').innerHTML = '<p class="placeholder">No entities detected.</p>';
564
  $('#entityList').innerHTML = '';
565
  $('#sentimentOverview').innerHTML = '<p class="placeholder">No sentiment data available.</p>';
 
261
  const timeSeconds = (data.processing_time_ms / 1000).toFixed(1);
262
  $('#processingTime').textContent = `⏱ ${timeSeconds}s`;
263
 
264
// Render markdown to HTML that is safe to assign to innerHTML.
// The text shown here is extracted document content or model output, so it is
// untrusted. marked does not sanitize what it generates, so its output is
// scrubbed below. When the marked CDN script did not load (failed or blocked),
// a minimal escaping fallback is used instead.
// NOTE(review): a maintained sanitizer such as DOMPurify would be preferable
// to this hand-rolled scrub if adding a dependency is acceptable.
const parseMarkdown = (text) => {
    if (!text) return '';

    let html;
    if (window.marked && typeof window.marked.parse === 'function') {
        html = window.marked.parse(text);
    } else if (typeof window.marked === 'function') {
        // Builds where `marked` itself is the render function.
        html = window.marked(text);
    } else {
        // Very basic fallback if marked fails to load; the input is escaped
        // first, so only the <br>/<strong> tags added here reach the DOM.
        return escapeHtml(text).replace(/\n/g, '<br>').replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>');
    }

    // Parse into an inert <template> so nothing executes or loads while the
    // markup is inspected.
    const template = document.createElement('template');
    template.innerHTML = html;

    template.content
        .querySelectorAll('script, style, iframe, object, embed, link, meta, base, form')
        .forEach((node) => node.remove());

    template.content.querySelectorAll('*').forEach((el) => {
        Array.from(el.attributes).forEach((attr) => {
            const name = attr.name.toLowerCase();
            // Strip whitespace/control characters that can hide a URL scheme.
            const value = attr.value.replace(/[\u0000-\u0020]/g, '');
            const isHandler = name.startsWith('on');
            const isUrlAttr = name === 'href' || name === 'src' || name === 'xlink:href' || name === 'action' || name === 'formaction';
            if (isHandler || (isUrlAttr && /^(?:javascript|vbscript|data):/i.test(value))) {
                el.removeAttribute(attr.name);
            }
        });
    });

    return template.innerHTML;
};
275
+
276
  // Extracted Text
277
  const textEl = $('#extractedText');
278
  if (data.extraction?.raw_text) {
279
+ textEl.innerHTML = parseMarkdown(data.extraction.raw_text);
280
  } else {
281
  textEl.innerHTML = `<p class="placeholder">${data.extraction?.error_message || 'No text extracted.'}</p>`;
282
  }
283
 
284
  // Summary
285
  if (data.summary) {
286
+ $('#summaryContent').innerHTML = parseMarkdown(data.summary.summary || 'Summary generation failed.');
287
+
288
  $('#summaryStats').classList.remove('hidden');
289
  $('#statOriginalLen').textContent = data.summary.original_length.toLocaleString();
290
  $('#statSummaryLen').textContent = data.summary.summary_length.toLocaleString();
291
  const pct = Math.round((1 - data.summary.compression_ratio) * 100);
292
  $('#statCompression').textContent = `${pct}%`;
293
  $('#statAlgorithm').textContent = data.summary.algorithm;
294
+
295
+ // Render Key Highlights
296
+ const highlightsContainer = $('#keyHighlightsContainer');
297
+ const highlightsList = $('#highlightsList');
298
+ if (data.summary.key_points && data.summary.key_points.length > 0) {
299
+ highlightsContainer.classList.remove('hidden');
300
+ highlightsList.innerHTML = data.summary.key_points
301
+ .map(point => {
302
+ let escaped = escapeHtml(point);
303
+ let bolded = escaped.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>');
304
+ return `<li>${bolded}</li>`;
305
+ })
306
+ .join('');
307
+ } else {
308
+ highlightsContainer.classList.add('hidden');
309
+ }
310
  } else {
311
  $('#summaryContent').innerHTML = '<p class="placeholder">Summarization not available.</p>';
312
  $('#summaryStats').classList.add('hidden');
313
+ $('#keyHighlightsContainer').classList.add('hidden');
314
  }
315
 
316
  // Entities
 
590
  $('#extractedText').innerHTML = '<p class="placeholder">No text extracted yet.</p>';
591
  $('#summaryContent').innerHTML = '<p class="placeholder">No summary available.</p>';
592
  $('#summaryStats').classList.add('hidden');
593
+ $('#keyHighlightsContainer').classList.add('hidden');
594
+ $('#highlightsList').innerHTML = '';
595
  $('#entityCategories').innerHTML = '<p class="placeholder">No entities detected.</p>';
596
  $('#entityList').innerHTML = '';
597
  $('#sentimentOverview').innerHTML = '<p class="placeholder">No sentiment data available.</p>';
static/index.html CHANGED
@@ -199,6 +199,10 @@
199
  <div class="summary-content" id="summaryContent">
200
  <p class="placeholder">No summary available.</p>
201
  </div>
 
 
 
 
202
  <div class="summary-stats hidden" id="summaryStats">
203
  <div class="stat-card">
204
  <span class="stat-value" id="statOriginalLen">0</span>
@@ -263,6 +267,7 @@
263
  <!-- Toast Container -->
264
  <div class="toast-container" id="toastContainer"></div>
265
 
 
266
  <script src="/static/app.js"></script>
267
  </body>
268
  </html>
 
199
  <div class="summary-content" id="summaryContent">
200
  <p class="placeholder">No summary available.</p>
201
  </div>
202
+ <div class="key-highlights hidden" id="keyHighlightsContainer">
203
+ <h4>Key Highlights</h4>
204
+ <ul class="highlights-list" id="highlightsList"></ul>
205
+ </div>
206
  <div class="summary-stats hidden" id="summaryStats">
207
  <div class="stat-card">
208
  <span class="stat-value" id="statOriginalLen">0</span>
 
267
  <!-- Toast Container -->
268
  <div class="toast-container" id="toastContainer"></div>
269
 
270
+ <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
271
  <script src="/static/app.js"></script>
272
  </body>
273
  </html>
static/styles.css CHANGED
@@ -639,18 +639,58 @@ body {
639
 
640
  /* --- Text Content --- */
641
  .text-content, .summary-content {
642
- padding: 24px;
643
  background: #ffffff;
644
  border: 1px solid var(--border-light);
645
  border-radius: var(--radius-lg);
646
- color: var(--text-primary);
647
  box-shadow: var(--shadow-sm);
648
- max-height: 500px;
649
  overflow-y: auto;
650
- font-size: 0.9rem;
651
- line-height: 1.8;
652
- white-space: pre-wrap;
653
  word-wrap: break-word;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
  }
655
 
656
  .summary-content {
@@ -1154,3 +1194,59 @@ body {
1154
  padding: 3px 8px;
1155
  }
1156
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
639
 
640
  /* --- Text Content --- */
641
  .text-content, .summary-content {
642
+ padding: 32px;
643
  background: #ffffff;
644
  border: 1px solid var(--border-light);
645
  border-radius: var(--radius-lg);
646
+ color: #334155; /* Slightly softer text color for reduced eye strain */
647
  box-shadow: var(--shadow-sm);
648
+ max-height: 600px;
649
  overflow-y: auto;
650
+ font-size: 1rem;
651
+ line-height: 1.7; /* Optimal readable line height */
 
652
  word-wrap: break-word;
653
+ text-align: left; /* Left alignment completely cures weird spacing gaps */
654
+ letter-spacing: 0.015em;
655
+ word-spacing: 0.05em; /* Smooth spacing between words */
656
+ }
657
+
658
+ /* Enhancing Markdown Elements */
659
+ .text-content p, .summary-content p {
660
+ margin-bottom: 1.25em;
661
+ }
662
+
663
+ .text-content p:last-child, .summary-content p:last-child {
664
+ margin-bottom: 0;
665
+ }
666
+
667
+ .text-content h1, .summary-content h1,
668
+ .text-content h2, .summary-content h2,
669
+ .text-content h3, .summary-content h3 {
670
+ margin-top: 1.5em;
671
+ margin-bottom: 0.75em;
672
+ font-weight: 700;
673
+ color: var(--text-primary);
674
+ }
675
+
676
+ .text-content h1, .summary-content h1 { font-size: 1.5rem; }
677
+ .text-content h2, .summary-content h2 { font-size: 1.35rem; }
678
+ .text-content h3, .summary-content h3 { font-size: 1.15rem; }
679
+
680
+ .text-content ul, .summary-content ul,
681
+ .text-content ol, .summary-content ol {
682
+ margin-top: 0;
683
+ margin-bottom: 1.25em;
684
+ padding-left: 1.5em;
685
+ }
686
+
687
+ .text-content li, .summary-content li {
688
+ margin-bottom: 0.5em;
689
+ }
690
+
691
+ .text-content strong, .summary-content strong {
692
+ font-weight: 600;
693
+ color: var(--text-primary);
694
  }
695
 
696
  .summary-content {
 
1194
  padding: 3px 8px;
1195
  }
1196
  }
1197
+
1198
+ /* --- Key Highlights Styling --- */
1199
+ .key-highlights {
1200
+ margin-top: 24px;
1201
+ padding-top: 20px;
1202
+ }
1203
+
1204
+ .key-highlights h4 {
1205
+ font-size: 1rem;
1206
+ font-weight: 700;
1207
+ margin-bottom: 16px;
1208
+ color: var(--accent-blue-deep);
1209
+ display: flex;
1210
+ align-items: center;
1211
+ gap: 8px;
1212
+ }
1213
+
1214
+ .key-highlights h4::before {
1215
+ content: '✨';
1216
+ }
1217
+
1218
+ .highlights-list {
1219
+ list-style: none;
1220
+ display: flex;
1221
+ flex-direction: column;
1222
+ gap: 12px;
1223
+ }
1224
+
1225
+ .highlights-list li {
1226
+ position: relative;
1227
+ padding: 12px 16px 12px 42px;
1228
+ background: var(--bg-primary);
1229
+ border: 1px solid var(--border-light);
1230
+ border-radius: var(--radius-md);
1231
+ font-size: 0.9rem;
1232
+ color: var(--text-primary);
1233
+ line-height: 1.6;
1234
+ transition: var(--transition-fast);
1235
+ }
1236
+
1237
+ .highlights-list li:hover {
1238
+ transform: translateX(4px);
1239
+ border-color: var(--accent-blue-light);
1240
+ background: var(--bg-secondary);
1241
+ box-shadow: var(--shadow-sm);
1242
+ }
1243
+
1244
+ .highlights-list li::before {
1245
+ content: '→';
1246
+ position: absolute;
1247
+ left: 16px;
1248
+ top: 50%;
1249
+ transform: translateY(-50%);
1250
+ color: var(--accent-blue);
1251
+ font-weight: 800;
1252
+ }
test_gemini.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Manual smoke check: send a blank image to the configured Gemini model."""
import os
import traceback

import google.generativeai as genai
from dotenv import load_dotenv
from PIL import Image

load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
model_name = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")

print(f"API Key available: {bool(api_key)}")
print(f"Model: {model_name}")

try:
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(model_name)
    # A plain white square is enough to exercise the image input path.
    blank = Image.new('RGB', (100, 100), color='white')
    reply = model.generate_content(["What color is this?", blank])
    print("Success:", reply.text)
except Exception:
    traceback.print_exc()
test_gemini_vision.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Manual check: find the first Gemini model that accepts an image prompt."""
import config
import google.generativeai as genai
from PIL import Image

CANDIDATE_MODELS = ["gemini-1.5-flash", "gemini-2.0-flash-lite", "gemini-1.5-flash-8b"]

try:
    img = Image.new('RGB', (100, 100), color='white')
    genai.configure(api_key=config.GEMINI_API_KEY)

    # Stop at the first model that answers; report each failure briefly.
    for model_name in CANDIDATE_MODELS:
        try:
            reply = genai.GenerativeModel(model_name).generate_content(
                ["Tell me what is in this image", img]
            )
            print(f"SUCCESS with {model_name}:", reply.text[:20])
            break
        except Exception as e:
            print(f"FAILED {model_name}: {type(e).__name__} {str(e)[:50]}")
except Exception as e:
    print("Fatal exception:", e)
test_models.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Manual check: report which Gemini models accept an image prompt."""
import os

import google.generativeai as genai
from dotenv import load_dotenv
from PIL import Image

load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=api_key)

# The probe image is the same for every model, so build it once.
img = Image.new('RGB', (100, 100), color='white')

for model_name in ["gemini-1.5-flash", "gemini-2.0-flash", "gemini-2.5-flash", "gemini-pro"]:
    try:
        print(f"Testing {model_name}...")
        model = genai.GenerativeModel(model_name)
        # Only success or failure matters; the reply content is not inspected.
        model.generate_content(["What color is this?", img])
        print(f"{model_name} Success!")
    except Exception as e:
        print(f"{model_name} Failed: {type(e).__name__}")
test_ocr.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
"""Manual check: list the Gemini models whose names look relevant."""
import config
import google.generativeai as genai

genai.configure(api_key=config.GEMINI_API_KEY)
models = [m.name for m in genai.list_models()]

# Substrings that mark a model name as interesting for this project.
WANTED = ('flash', '2.5', '1.5')

print("Available models:")
for name in models:
    lowered = name.lower()
    if any(tag in lowered for tag in WANTED):
        print(name)
test_raw.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Manual check: call the Gemini REST endpoint directly and print the reply."""
import os

import requests
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")

url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key={api_key}"
headers = {"Content-Type": "application/json"}
data = {
    "contents": [{"parts": [{"text": "Hello, world!"}]}]
}

# requests waits indefinitely by default; bound the wait so the script cannot hang.
response = requests.post(url, headers=headers, json=data, timeout=30)
print(f"Status Code: {response.status_code}")
print(f"Response Body:\n{response.text}")
test_raw2.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Manual check: call the Gemini REST endpoint and save the reply to a file."""
import json
import os

import requests
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")


def test_api():
    """POST a trivial prompt and write the outcome to clean_output.txt.

    The file receives the status code followed by the pretty-printed JSON body,
    the raw body when it is not JSON, or the error text when the request fails.
    """
    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key={api_key}"
    headers = {"Content-Type": "application/json"}
    data = {
        "contents": [{"parts": [{"text": "Hello, world!"}]}]
    }

    try:
        # requests waits indefinitely by default; bound the wait.
        response = requests.post(url, headers=headers, json=data, timeout=30)
        try:
            body = json.dumps(response.json(), indent=2)
        except ValueError:
            # Non-JSON body (e.g. an HTML error page): keep the status code
            # and record the raw text instead of losing both.
            body = response.text
        report = f"Status Code: {response.status_code}\n{body}"
    except Exception as e:
        report = str(e)

    with open("clean_output.txt", "w", encoding="utf-8") as f:
        f.write(report)


test_api()