File size: 1,869 Bytes
39028c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
{
  "project": {
    "name": "Intent-Aware Context-Preserving Summarization System",
    "version": "1.0.0",
    "description": "Advanced summarization system for long technical documents using generative AI with a RAG pipeline and intelligent model selection",
    "author": "AI Development Team",
    "license": "MIT"
  },
  "model": {
    "primary_model": "t5-small",
    "alternative_models": [
      "t5-small",
      "t5-base",
      "t5-large",
      "mbart-50-small",
      "mt5-small",
      "facebook/bart-base",
      "facebook/bart-large-cnn",
      "google/pegasus-arxiv",
      "google/pegasus-pubmed",
      "allenai/led-base-16384"
    ],
    "device": "auto",
    "max_input_length": 512,
    "max_output_length": 150,
    "min_output_length": 50,
    "num_beams": 2,
    "supported_languages": [
      "english", "spanish", "french", "german", "italian",
      "portuguese", "chinese", "japanese", "korean", "arabic",
      "hindi", "russian", "turkish", "vietnamese", "thai"
    ],
    "default_language": "english"
  },
  "summarization": {
    "intent_types": [
      "technical_overview",
      "detailed_analysis",
      "methodology",
      "results",
      "conclusion",
      "abstract"
    ],
    "chunk_size": 512,
    "chunk_overlap": 50,
    "preserve_context": true
  },
  "preprocessing": {
    "remove_citations": true,
    "remove_equations": false,
    "remove_stopwords": false,
    "clean_text": true,
    "normalize_whitespace": true
  },
  "evaluation": {
    "metrics": ["rouge1", "rouge2", "rougeL", "bert_score"],
    "use_stemmer": true,
    "human_evaluation": true
  },
  "datasets": {
    "sources": [
      "arXiv",
      "PubMed",
      "Scientific Papers"
    ],
    "data_dir": "data/",
    "cache_dir": "models/"
  },
  "output": {
    "results_dir": "results/",
    "save_format": "json",
    "log_level": "INFO"
  }
}