Rishabh2095 committed on
Commit
f57fe35
·
1 Parent(s): daec63e

Added feature in resume loader to allow uploading resume from Hugging Face datasets

Browse files
.gitignore CHANGED
@@ -46,6 +46,7 @@ requirements.txt
46
  docker-compose.override.example.yml
47
  DOCKERFILE_EXPLANATION.md
48
  DEPLOYMENT_GUIDE.md
 
49
 
50
  # Binary files (PDFs, images, etc.)
51
  *.pdf
 
46
  docker-compose.override.example.yml
47
  DOCKERFILE_EXPLANATION.md
48
  DEPLOYMENT_GUIDE.md
49
# Agent log files. Anchored to the repository root with a leading "/";
# a leading "./" is not understood by gitignore and would match nothing.
/src/job_writing_agent/logs/*.log
50
 
51
  # Binary files (PDFs, images, etc.)
52
  *.pdf
src/job_writing_agent/nodes/resume_loader.py CHANGED
@@ -7,9 +7,13 @@ the resume file and returning the resume in the required format.
7
  """
8
 
9
  import logging
10
- from typing import Callable, Any, Optional
 
11
 
12
- from job_writing_agent.utils.document_processing import parse_resume
 
 
 
13
  from job_writing_agent.utils.logging.logging_decorators import (
14
  log_async,
15
  log_errors,
@@ -55,8 +59,8 @@ class ResumeLoader:
55
  Parameters
56
  ----------
57
  resume_source: Any
58
- Path or file-like object accepted by the parser function.
59
- Can be a file path, URL, or file-like object.
60
 
61
  Returns
62
  -------
@@ -74,7 +78,10 @@ class ResumeLoader:
74
  resume_text = ""
75
  assert resume_source is not None, "resume_source cannot be None"
76
 
77
- resume_chunks = self._parser(resume_source)
 
 
 
78
 
79
  for chunk in resume_chunks:
80
  if hasattr(chunk, "page_content") and chunk.page_content:
 
7
  """
8
 
9
  import logging
10
+ from pathlib import Path
11
+ from typing import Any, Callable, Optional
12
 
13
+ from job_writing_agent.utils.document_processing import (
14
+ get_resume as get_resume_docs,
15
+ parse_resume,
16
+ )
17
  from job_writing_agent.utils.logging.logging_decorators import (
18
  log_async,
19
  log_errors,
 
59
  Parameters
60
  ----------
61
  resume_source: Any
62
+ Path, URL, or file-like object. Supports local paths, HTTP/HTTPS URLs,
63
+ and HuggingFace Hub dataset references (e.g., "username/dataset::resume.pdf").
64
 
65
  Returns
66
  -------
 
78
  resume_text = ""
79
  assert resume_source is not None, "resume_source cannot be None"
80
 
81
+ if isinstance(resume_source, (str, Path)):
82
+ resume_chunks = await get_resume_docs(resume_source)
83
+ else:
84
+ resume_chunks = self._parser(resume_source)
85
 
86
  for chunk in resume_chunks:
87
  if hasattr(chunk, "page_content") and chunk.page_content:
src/job_writing_agent/utils/document_processing.py CHANGED
@@ -3,14 +3,19 @@ Document processing utilities for parsing resumes and job descriptions.
3
  """
4
 
5
  # Standard library imports
 
6
  import logging
7
  import os
8
  import re
 
9
  from pathlib import Path
 
10
  from urllib.parse import urlparse
11
 
12
  # Third-party imports
13
  import dspy
 
 
14
  from langchain_community.document_loaders import PyPDFLoader, AsyncChromiumLoader
15
  from langchain_community.document_transformers import Html2TextTransformer
16
  from langchain_core.documents import Document
@@ -23,7 +28,12 @@ from pydantic import BaseModel, Field
23
  from typing_extensions import Any
24
 
25
  # Local imports
26
- from .errors import JobDescriptionParsingError, LLMProcessingError, URLExtractionError
 
 
 
 
 
27
 
28
  # Set up logging
29
  logger = logging.getLogger(__name__)
@@ -258,46 +268,167 @@ def _is_heading(line: str) -> bool:
258
  return line.isupper() and len(line.split()) <= 5 and not re.search(r"\d", line)
259
 
260
 
261
- def parse_resume(file_path_or_url: str | Path) -> list[Document]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  """
263
- Load a résumé from PDF or TXT file or URL → list[Document] chunks
264
  (≈400 chars, 50‑char overlap) with {source, section} metadata.
265
-
266
- Supports:
267
- - Local file paths: "/path/to/resume.pdf"
268
- - URLs: "https://example.com/resume.pdf" or "s3://bucket/resume.pdf"
269
  """
270
- import tempfile
271
- import urllib.request
272
-
273
- # Handle URLs
274
- file_path = str(file_path_or_url)
275
- is_url = file_path.startswith(("http://", "https://", "s3://", "gs://"))
276
- tmp_file_path = None
277
-
278
- if is_url:
279
- logger.info(f"Downloading resume from URL: {file_path}")
280
- # Create temporary file for downloaded resume
281
- file_extension = Path(urlparse(file_path).path).suffix.lower()
282
- if not file_extension:
283
- file_extension = ".pdf" # Default to PDF if extension not in URL
284
-
285
- tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
286
- tmp_file_path = tmp_file.name
287
- tmp_file.close()
288
-
289
- try:
290
- # Download file from URL
291
- urllib.request.urlretrieve(file_path, tmp_file_path)
292
- file_path = tmp_file_path
293
- logger.info(f"Resume downloaded to temporary file: {file_path}")
294
- except Exception as e:
295
- # Clean up temp file on error
296
- if tmp_file_path and os.path.exists(tmp_file_path):
297
- os.unlink(tmp_file_path)
298
- logger.error(f"Failed to download resume from URL: {e}")
299
- raise ValueError(f"Could not download resume from URL {file_path_or_url}: {e}")
300
-
301
  file_extension = Path(file_path).suffix.lower()
302
 
303
  # Handle different file types
@@ -336,21 +467,53 @@ def parse_resume(file_path_or_url: str | Path) -> list[Document]:
336
  for chunk in splitter.split_text(md_text)
337
  ] # Attach metadata
338
  for doc in chunks:
339
- # Use original source (URL or path) in metadata, not temp file path
340
- doc.metadata.setdefault("source", str(file_path_or_url))
341
  # section already present if header‑splitter was used
342
-
343
- # Clean up temporary file if it was downloaded from URL
344
- if tmp_file_path and os.path.exists(tmp_file_path):
345
- try:
346
- os.unlink(tmp_file_path)
347
- logger.debug(f"Cleaned up temporary file: {tmp_file_path}")
348
- except Exception as e:
349
- logger.warning(f"Failed to clean up temporary file {tmp_file_path}: {e}")
350
-
351
  return chunks
352
 
353
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  async def get_job_description(file_path_or_url: str) -> Document:
355
  """Parse a job description from a file or URL into chunks.
356
 
 
3
  """
4
 
5
  # Standard library imports
6
+ import asyncio
7
  import logging
8
  import os
9
  import re
10
+ import tempfile
11
  from pathlib import Path
12
+ from typing import Optional
13
  from urllib.parse import urlparse
14
 
15
  # Third-party imports
16
  import dspy
17
+ import httpx
18
+ from huggingface_hub import hf_hub_download
19
  from langchain_community.document_loaders import PyPDFLoader, AsyncChromiumLoader
20
  from langchain_community.document_transformers import Html2TextTransformer
21
  from langchain_core.documents import Document
 
28
  from typing_extensions import Any
29
 
30
  # Local imports
31
+ from .errors import (
32
+ JobDescriptionParsingError,
33
+ LLMProcessingError,
34
+ ResumeDownloadError,
35
+ URLExtractionError,
36
+ )
37
 
38
  # Set up logging
39
  logger = logging.getLogger(__name__)
 
268
  return line.isupper() and len(line.split()) <= 5 and not re.search(r"\d", line)
269
 
270
 
271
+ def _is_huggingface_hub_url(url: str) -> tuple[bool, Optional[str], Optional[str]]:
272
+ """
273
+ Detect if URL or string is a HuggingFace Hub reference and extract repo_id and filename.
274
+
275
+ Args:
276
+ url: URL or string to check (e.g., "https://huggingface.co/datasets/username/dataset/resolve/main/file.pdf"
277
+ or "username/dataset-name::resume.pdf")
278
+
279
+ Returns:
280
+ Tuple of (is_hf_url, repo_id, filename). Returns (False, None, None) if not HF Hub.
281
+ """
282
+ if not url or not isinstance(url, str):
283
+ return (False, None, None)
284
+
285
+ # Custom format: "username/dataset-name::filename"
286
+ if "::" in url and not url.startswith(("http://", "https://")):
287
+ parts = url.split("::", 1)
288
+ if len(parts) == 2 and "/" in parts[0] and parts[1].strip():
289
+ return (True, parts[0].strip(), parts[1].strip())
290
+ return (False, None, None)
291
+
292
+ # HF Hub URL patterns
293
+ if not url.startswith(("http://", "https://")):
294
+ return (False, None, None)
295
+
296
+ parsed = urlparse(url)
297
+ if "huggingface.co" not in parsed.netloc:
298
+ return (False, None, None)
299
+
300
+ # Pattern: /datasets/{username}/{dataset}/resolve/main/{filename}
301
+ # Pattern: /datasets/{username}/{dataset}/blob/main/{filename}
302
+ # Pattern: /{username}/{dataset}/resolve/main/{filename} (models)
303
+ match = re.match(
304
+ r"^/(?:datasets/)?([^/]+)/([^/]+)/(?:resolve|blob)/[^/]+/(.+)$",
305
+ parsed.path,
306
+ )
307
+ if match:
308
+ repo_id = f"{match.group(1)}/{match.group(2)}"
309
+ filename = match.group(3)
310
+ return (True, repo_id, filename)
311
+
312
+ return (False, None, None)
313
+
314
+
315
async def download_file_from_hf_hub(
    repo_id: str,
    filename: str,
    repo_type: str = "dataset",
    token: Optional[str] = None,
    cache_dir: Optional[Path] = None,
) -> Path:
    """
    Fetch a single file from a HuggingFace Hub repository.

    The blocking ``hf_hub_download`` call is executed in a worker thread via
    ``asyncio.to_thread`` so the event loop is not held up.

    Args:
        repo_id: Hub repository ID containing a "/" (e.g., "username/dataset-name").
        filename: Name of the file inside the repository (e.g., "resume.pdf").
            Surrounding whitespace is stripped before the download.
        repo_type: Repository type passed through to ``hf_hub_download``.
            Defaults to "dataset".
        token: Optional HF API token. When not given, the HUGGINGFACE_API_KEY
            environment variable is used.
        cache_dir: Optional cache directory. When not given, the HF_HOME
            environment variable is used, then the system temp directory.

    Returns:
        Path to the local copy of the file.

    Raises:
        ValueError: If repo_id or filename is invalid.
        ResumeDownloadError: If the download fails for any reason.
    """
    repo_id_is_valid = isinstance(repo_id, str) and "/" in repo_id
    if not repo_id_is_valid:
        raise ValueError(
            f"Invalid repo_id: {repo_id}. Expected format: username/dataset-name"
        )

    filename_is_valid = isinstance(filename, str) and bool(filename.strip())
    if not filename_is_valid:
        raise ValueError("filename must be a non-empty string")

    # Resolve the cache location: explicit argument, then HF_HOME, then temp dir.
    if cache_dir:
        cache_location = str(cache_dir)
    else:
        cache_location = os.getenv("HF_HOME") or tempfile.gettempdir()

    auth_token = token or os.getenv("HUGGINGFACE_API_KEY")

    try:
        logger.info("Downloading %s from HF Hub repo %s", filename, repo_id)
        downloaded = await asyncio.to_thread(
            hf_hub_download,
            repo_id=repo_id,
            filename=filename.strip(),
            repo_type=repo_type,
            token=auth_token,
            cache_dir=cache_location,
        )
        logger.info("Downloaded resume to %s", downloaded)
        return Path(downloaded)
    except Exception as e:
        # Any failure is surfaced to callers as a single error type.
        logger.error("Failed to download from HF Hub: %s", e)
        raise ResumeDownloadError(
            f"Could not download {filename} from {repo_id}: {e}"
        ) from e
372
+
373
+
374
async def download_file_from_url(
    url: str,
    save_dir: Optional[Path] = None,
    filename: Optional[str] = None,
) -> Path:
    """
    Download a file from an HTTP/HTTPS URL to a local temporary location.

    Handles generic web URLs (GitHub raw files, public cloud storage, etc.).
    For HuggingFace Hub, use download_file_from_hf_hub() instead.

    Args:
        url: The URL to download from (must start with http:// or https://).
        save_dir: Optional directory to save the file in. Defaults to the
            system temp directory. Created if it does not exist.
        filename: Optional filename. If not provided, it is taken from the
            last URL path segment ("resume.pdf" when the path is empty), and
            ".pdf" is appended when that name has no extension.

    Returns:
        Path to the downloaded file.

    Raises:
        ValueError: If URL format is invalid.
        ResumeDownloadError: If the download or the write to disk fails.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        raise ValueError("URL must start with http:// or https://")

    save_dir = Path(save_dir) if save_dir else Path(tempfile.gettempdir())
    save_dir.mkdir(parents=True, exist_ok=True)

    if not filename:
        filename = Path(parsed.path).name or "resume.pdf"
        # parse_resume() inspects the file suffix, so an inferred name without
        # an extension (e.g. ".../download/12345") defaults to PDF, as the
        # previous URL handling did.
        if not Path(filename).suffix:
            filename += ".pdf"

    local_path = save_dir / filename
    logger.info("Downloading resume from URL: %s", url)

    # Tracks whether this call has touched local_path, so cleanup never
    # removes a file that existed before the download was attempted.
    write_started = False
    try:
        async with httpx.AsyncClient(follow_redirects=True) as client:
            response = await client.get(url)
            response.raise_for_status()
        write_started = True
        local_path.write_bytes(response.content)
        logger.info("Downloaded resume to %s", local_path)
        return local_path
    except httpx.HTTPError as e:
        # Nothing has been written at this point, so there is nothing to
        # clean up; any file already at local_path belongs to someone else.
        logger.error("HTTP error downloading from %s: %s", url, e)
        raise ResumeDownloadError(f"Could not download from {url}: {e}") from e
    except OSError as e:
        logger.error("Error writing file from %s: %s", url, e)
        if write_started:
            # A failed write can leave a truncated file behind; remove it so
            # a later parse does not pick up corrupt data.
            try:
                local_path.unlink(missing_ok=True)
            except OSError:
                logger.warning("Could not remove partial download %s", local_path)
        raise ResumeDownloadError(f"Could not save file from {url}: {e}") from e
425
+
426
+
427
+ def parse_resume(file_path: str | Path) -> list[Document]:
428
  """
429
+ Load a résumé from PDF or TXT file → list[Document] chunks
430
  (≈400 chars, 50‑char overlap) with {source, section} metadata.
 
 
 
 
431
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  file_extension = Path(file_path).suffix.lower()
433
 
434
  # Handle different file types
 
467
  for chunk in splitter.split_text(md_text)
468
  ] # Attach metadata
469
  for doc in chunks:
470
+ doc.metadata.setdefault("source", str(file_path))
 
471
  # section already present if header‑splitter was used
 
 
 
 
 
 
 
 
 
472
  return chunks
473
 
474
 
475
def _restore_source_metadata(
    docs: list[Document], local_path: Path, original_source: str
) -> list[Document]:
    """Point chunk ``source`` metadata at the original reference, not the local download path."""
    local = str(local_path)
    for doc in docs:
        # Only replace a missing value or the local-path value that
        # parse_resume() fills in; anything else is left alone.
        if doc.metadata.get("source", local) == local:
            doc.metadata["source"] = original_source
    return docs


async def get_resume(file_path_or_url: str | Path) -> list[Document]:
    """
    Load a résumé from a local file path or URL.

    Handles both local files and URLs by downloading if needed, then delegating
    to parse_resume() for parsing. Supports HuggingFace Hub datasets and
    generic HTTP/HTTPS URLs.

    Args:
        file_path_or_url: Local file path, HF Hub reference, or URL.
            Examples:
            - Local: "/path/to/resume.pdf"
            - HF Hub URL: "https://huggingface.co/datasets/username/dataset/resolve/main/resume.pdf"
            - HF Hub format: "username/dataset-name::resume.pdf"
            - Generic HTTP: "https://example.com/resume.pdf"

    Returns:
        List of Document chunks with resume content. For downloaded files the
        ``source`` metadata refers to the original URL/reference rather than
        the local download path.

    Raises:
        ResumeDownloadError: If URL download fails.
        ValueError: If file path is invalid or unsupported format.
    """
    source = str(file_path_or_url)
    is_remote_url = source.startswith(("http://", "https://"))

    # 1. Check if HuggingFace Hub URL or custom format
    is_hf, repo_id, filename = _is_huggingface_hub_url(source)
    if is_hf and repo_id and filename:
        # Hub web URLs carry a "/datasets/" prefix for dataset repos; URLs
        # without it are the model-repo pattern. The "repo::file" shorthand
        # keeps the dataset default.
        repo_type = "dataset"
        if is_remote_url and not urlparse(source).path.startswith("/datasets/"):
            repo_type = "model"
        local_path = await download_file_from_hf_hub(
            repo_id=repo_id, filename=filename, repo_type=repo_type
        )
        return _restore_source_metadata(parse_resume(local_path), local_path, source)

    # 2. Check if generic HTTP/HTTPS URL
    if is_remote_url:
        local_path = await download_file_from_url(source)
        try:
            docs = parse_resume(local_path)
        finally:
            # The download only exists to feed the parser; remove it so temp
            # files do not accumulate (best effort).
            try:
                local_path.unlink(missing_ok=True)
            except OSError as e:
                logger.warning(
                    "Failed to clean up temporary file %s: %s", local_path, e
                )
        return _restore_source_metadata(docs, local_path, source)

    # 3. Treat as local file path
    return parse_resume(Path(file_path_or_url))
515
+
516
+
517
  async def get_job_description(file_path_or_url: str) -> Document:
518
  """Parse a job description from a file or URL into chunks.
519
 
src/job_writing_agent/utils/errors.py CHANGED
@@ -17,4 +17,9 @@ class LLMProcessingError(Exception):
17
 
18
  class JobDescriptionParsingError(Exception):
19
  """Base class for job description parsing errors."""
 
 
 
 
 
20
  pass
 
17
 
18
  class JobDescriptionParsingError(Exception):
19
  """Base class for job description parsing errors."""
20
+ pass
21
+
22
+
23
class ResumeDownloadError(Exception):
    """Signals that fetching a resume file from a remote location failed."""