File size: 6,163 Bytes
742cc20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# © 2025 Elena Marziali — Code released under Apache 2.0 license.
# See LICENSE in the repository for details.
# Removal of this copyright is prohibited.

# === Text Translation ===

# Module-level dictionary used as a cache for previously translated texts
translation_cache = dict()


def detect_language(text):
    """Return the result of ``detect(text)``, or "unknown" if detection fails.

    Any exception raised by the detection call is printed to stdout and
    replaced by the placeholder string "unknown".
    """
    try:
        language = detect(text)
    except Exception as error:
        print(f"Language detection error: {error}")
        language = "unknown"
    return language

def translate_text(text, source_lang, target_lang):
    """Translate ``text`` with a Helsinki-NLP opus-mt model, printing debug output.

    Args:
        text: The text to translate.
        source_lang: Language code of ``text``; used to build the model name.
        target_lang: Language code to translate into; used to build the
            model name.

    Returns:
        The translated text. ``text`` is returned unchanged, without loading
        any model, when it is blank or when both language codes are equal.
    """
    # Nothing to translate: skip the model load for blank input, and do not
    # request a "<lang>-<lang>" model for a same-language call.
    if not text.strip() or source_lang == target_lang:
        return text

    translation_model = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    print(f"Using translation model: {translation_model}")

    translator = pipeline("translation", model=translation_model)

    translation = translator(text)[0]['translation_text']
    print(f"Original text: {text}")
    print(f"Translated text: {translation}")

    return translation

def extract_text_pdf(file_name):
    """Extract the text of every page of a PDF file.

    Args:
        file_name: Path of the PDF, opened with ``pdfplumber``.

    Returns:
        The page texts joined by newlines, with leading and trailing
        whitespace removed. A page that yields no text contributes an
        empty line.
    """
    with pdfplumber.open(file_name) as pdf:
        # extract_text() can yield None for a page without extractable text
        # (e.g. a scanned image); fall back to "" so the join cannot fail.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts).strip()

def extract_text_docx(file_name):
    """Return the paragraph texts of a DOCX file, one per line, stripped of surrounding whitespace."""
    document = Document(file_name)
    lines = []
    for paragraph in document.paragraphs:
        lines.append(paragraph.text)
    joined = "\n".join(lines)
    return joined.strip()

def save_docx(text, output_file_name):
    """Write ``text`` as one paragraph of a new DOCX document saved at ``output_file_name``."""
    document = Document()
    document.add_paragraph(text)
    document.save(output_file_name)

def extract_text_csv(file_name):
    """Extract the cell contents of a CSV file as plain text.

    Every data row becomes one output line: its cell values are converted
    to strings and separated by single spaces. The header row is not part
    of the output.

    Args:
        file_name: Path of the CSV file, read with pandas.

    Returns:
        The rows joined by newlines with surrounding whitespace stripped;
        an empty string when the file has a header but no data rows.
    """
    df = pd.read_csv(file_name)
    # Iterate the rows directly instead of DataFrame.apply(axis=1): on a
    # frame without rows apply() does not produce a string Series, so the
    # chained ``.str.cat`` raised AttributeError for header-only files.
    rows = df.astype(str).itertuples(index=False, name=None)
    return "\n".join(" ".join(row) for row in rows).strip()

def extract_text_tsv(file_name):
    """Extract the cell contents of a tab-separated file as plain text.

    Every data row becomes one output line: its cell values are converted
    to strings and separated by single spaces. The header row is not part
    of the output.

    Args:
        file_name: Path of the TSV file, read with pandas using a tab
            separator.

    Returns:
        The rows joined by newlines with surrounding whitespace stripped;
        an empty string when the file has a header but no data rows.
    """
    df = pd.read_csv(file_name, sep='\t')
    # Iterate the rows directly instead of DataFrame.apply(axis=1): on a
    # frame without rows apply() does not produce a string Series, so the
    # chained ``.str.cat`` raised AttributeError for header-only files.
    rows = df.astype(str).itertuples(index=False, name=None)
    return "\n".join(" ".join(row) for row in rows).strip()

def handle_file(file_name):
    """Translate a document into a language chosen by the user and save it.

    The file is read according to its extension (PDF, DOCX, CSV or TSV),
    its language is detected, and the user is asked on stdin for a target
    language. The translation is written next to the input file: DOCX input
    produces a DOCX file, every other format produces a UTF-8 ``.txt`` file
    because only plain text is available for writing.

    Args:
        file_name: Path of the file to translate.

    Returns:
        A human-readable status message: a description of the problem
        (unsupported format, undetectable language, unsupported target
        language) or a completion notice naming the output file.
    """
    import os  # local import: only needed for the path handling below

    directory, base_name = os.path.split(file_name)
    stem, dot_extension = os.path.splitext(base_name)
    extension = dot_extension[1:].lower()

    if extension == "pdf":
        text = extract_text_pdf(file_name)
    elif extension == "docx":
        text = extract_text_docx(file_name)
    elif extension == "csv":
        text = extract_text_csv(file_name)
    elif extension == "tsv":
        text = extract_text_tsv(file_name)
    else:
        return "Unsupported format! Use PDF, DOCX, CSV, or TSV."

    original_language = detect_language(text)
    if original_language == "unknown":
        # detect_language() reports failures as "unknown"; no translation
        # model name can be built from that value.
        return "Error: the language of the file could not be detected."
    print(f"The file was detected in **{original_language}**.")

    # List of available languages
    available_languages = ["en", "fr", "de", "es", "zh", "ja", "ar", "it"]

    # Ask the user for the target language
    print(f"Available languages for translation: {', '.join(available_languages)}")
    target_language = input(
        "Which language do you want the explanation in? "
        "(e.g., 'en' for English, 'fr' for French): "
    ).strip().lower()

    if target_language not in available_languages:
        # Stop here: previously the error was only printed and translation
        # was still attempted with the invalid language code.
        return "Error: Unsupported language!"
    print(f"The explanation will be translated into {target_language}.")

    if target_language == original_language:
        # Already in the requested language; nothing to translate.
        translated_text = text
    else:
        # Whole documents go through translate_long_text(), which feeds the
        # model bounded blocks instead of one input of arbitrary length.
        translated_text = translate_long_text(
            text, source_lang=original_language, target_lang=target_language
        )

    # Save the translated file next to the input. The prefix is applied to
    # the base name only, so a path with directories stays valid.
    if extension == "docx":
        translated_file_name = os.path.join(
            directory, f"translated_{target_language}_{base_name}"
        )
        save_docx(translated_text, translated_file_name)
    else:
        # The result is plain text, so it is stored as .txt rather than under
        # the original .pdf/.csv/.tsv extension.
        translated_file_name = os.path.join(
            directory, f"translated_{target_language}_{stem}.txt"
        )
        with open(translated_file_name, "w", encoding="utf-8") as f:
            f.write(translated_text)

    return f"Translation completed! Download the file: {translated_file_name}"

# Module-level dictionary in which saved journals are kept
journal_store = dict()

def save_multilingual_journal(journal_text, journal_id, target_language):
    """Store a journal together with its translation and index its embedding.

    Args:
        journal_text: Text of the journal as written.
        journal_id: Key under which the entry is stored in ``journal_store``.
        target_language: Language code the stored translation should be in;
            also used as the key of the translation inside the entry.

    Side effects:
        Replaces ``journal_store[journal_id]`` and adds the embedding of the
        translated text to the module-level ``index``.
    """
    source_language = detect_language(journal_text)

    if source_language == "unknown":
        # detect_language() returns "unknown" when detection fails; no model
        # name can be built from it, so keep the text untranslated instead
        # of failing while loading a non-existent model.
        print("Source language could not be detected; storing the text untranslated.")
        translated_text = journal_text
    elif source_language != target_language:
        translated_text = translate_long_text(
            journal_text, source_lang=source_language, target_lang=target_language
        )
    else:
        translated_text = journal_text

    journal_store[journal_id] = {
        "original": journal_text,
        target_language: translated_text
    }

    # NOTE(review): search_similar_journals() looks entries up by the position
    # returned from index.search(), so lookups only succeed when journal_id
    # equals the insertion position in the index - confirm with callers.
    embedding = safe_encode(translated_text)
    # ndmin=2 turns a single 1-D vector into one row; an array that is
    # already 2-D is unchanged. Assumes index is a FAISS index, whose add()
    # takes an (n, d) matrix - TODO confirm.
    index.add(np.array(embedding, dtype=np.float32, ndmin=2))



def _split_text_blocks(text, max_chars):
    """Split text into consecutive blocks of at most max_chars characters, cutting at a sentence end or whitespace when one is available."""
    blocks = []
    start, total = 0, len(text)
    while start < total:
        end = min(start + max_chars, total)
        # Only look for a better cut when the hard limit would fall inside a
        # word, i.e. the character right after the window is not whitespace.
        if end < total and not text[end].isspace():
            window = text[start:end]
            cut = max(window.rfind(mark) for mark in (". ", "! ", "? ", "\n"))
            if cut <= 0:
                cut = window.rfind(" ")
            if cut > 0:
                end = start + cut + 1
        blocks.append(text[start:end])
        start = end
    return blocks


def translate_long_text(text, source_lang="it", target_lang="en", max_chars=400):
    """Translate a long text block by block with a Helsinki-NLP opus-mt model.

    The text is cut into blocks of at most ``max_chars`` characters. Cuts are
    placed at a sentence end or, failing that, at a space, so that words are
    not split; a block is cut at the hard limit only when it contains no
    such position. Each block is translated on its own and the results are
    joined with newlines.

    Args:
        text: The text to translate.
        source_lang: Language code of ``text``; used to build the model name.
        target_lang: Language code to translate into; used to build the
            model name.
        max_chars: Maximum number of characters per block; must be >= 1.

    Returns:
        The translated blocks joined by newlines. A block whose translation
        raises is replaced by "[Translation error]". Blank input is returned
        unchanged without loading a model.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")
    if not text.strip():
        return text

    translation_model = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    translator = pipeline("translation", model=translation_model)

    translated = []
    for block in _split_text_blocks(text, max_chars):
        block = block.strip()
        if not block:
            # A block made only of whitespace carries nothing to translate.
            continue
        try:
            output = translator(block)[0]['translation_text']
            translated.append(output)
        except Exception as e:
            # Best effort: keep going so one failing block does not discard
            # the translation of the rest of the text.
            print(f"Error translating block: {e}")
            translated.append("[Translation error]")

    return "\n".join(translated)

def search_similar_journals(query, target_language, top_k=3):
    """Return stored journal translations closest to ``query``.

    The query is translated into ``target_language`` when its detected
    language differs, encoded with ``safe_encode`` and searched in the
    module-level ``index``.

    Args:
        query: Free-text search query.
        target_language: Language code used both for translating the query
            and for selecting which stored translation is returned.
        top_k: Maximum number of neighbours requested from the index.

    Returns:
        A list with up to ``top_k`` strings. A hit with no entry in
        ``journal_store``, or with no translation in ``target_language``,
        yields an empty string. An empty list is returned when the index
        reports that it is not trained.
    """
    # Check the index first so no translation or encoding work is done for a
    # search that cannot run.
    if hasattr(index, "is_trained") and not index.is_trained:
        print("FAISS index is not trained.")
        return []

    query_language = detect_language(query)

    # "unknown" is what detect_language() returns on failure; it cannot be
    # used as a source language, so the query is searched as written.
    if query_language in ("unknown", target_language):
        translated_query = query
    else:
        translated_query = translate_long_text(
            query, source_lang=query_language, target_lang=target_language
        )

    query_emb = safe_encode(translated_query)
    # ndmin=2 turns a single 1-D vector into one row; an array that is
    # already 2-D is unchanged.
    query_emb = np.array(query_emb, dtype=np.float32, ndmin=2)

    _, neighbor_ids = index.search(query_emb, top_k)
    results = []
    for neighbor_id in neighbor_ids[0]:
        if neighbor_id < 0:
            # FAISS pads the result with -1 when the index holds fewer than
            # top_k vectors; such a slot is not a journal.
            continue
        # NOTE(review): the lookup key is the position reported by the index,
        # while journal_store is keyed by journal_id - confirm they coincide.
        journal = journal_store.get(neighbor_id, {})
        results.append(journal.get(target_language, ""))
    return results

# === Valid Input Function ===
def get_valid_input(message, valid_options=None):
    """Prompt on stdin until the user enters an acceptable value.

    Args:
        message: Prompt text passed to ``input``.
        valid_options: Optional iterable of accepted values. Matching
            ignores letter case and surrounding whitespace. When omitted or
            empty, any non-empty answer is accepted.

    Returns:
        The user's answer, stripped of surrounding whitespace and
        lower-cased.
    """
    # The answer is lower-cased before comparison, so the options must be
    # normalized the same way; otherwise an option containing capital letters
    # could never match and the loop would never end.
    options = [str(option) for option in valid_options] if valid_options else []
    accepted = {option.strip().lower() for option in options}

    while True:
        value = input(message).strip().lower()
        if not value:
            print("Error! Please enter a valid value.")
        elif accepted and value not in accepted:
            print(f"Error! You must choose from: {', '.join(options)}")
        else:
            return value