File size: 4,971 Bytes
ffcb401
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# © 2025 Elena Marziali — Code released under Apache 2.0 license.
# See LICENSE in the repository for details.
# Removal of this copyright is prohibited.

# Evaluate the structure of the AI response from the LLM
def validate_ai_structure(response, expected_fields=("title", "abstract", "url")):
    """Keep only the entries of *response* that are dicts with every expected key.

    Args:
        response: Candidate structure. Anything that is not a ``list``
            produces an empty result.
        expected_fields: Keys that an entry must contain to be kept.

    Returns:
        A new list holding the qualifying entries in their original order.
    """
    if not isinstance(response, list):
        return []

    def _is_complete(entry):
        # Non-dict entries are dropped before any key lookup is attempted.
        return isinstance(entry, dict) and all(
            field in entry for field in expected_fields
        )

    return [entry for entry in response if _is_complete(entry)]

import math

# Compute semantic score of the response
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of *x*.

    The computation is split by sign so that ``math.exp`` is only ever
    called with a non-positive argument. The naive formula raises
    ``OverflowError`` for large negative inputs (``math.exp(-x)`` overflows);
    this form saturates to 0.0 instead.

    Args:
        x: A real number.

    Returns:
        A float in the closed interval [0.0, 1.0].
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For negative x use the algebraically equivalent e**x / (1 + e**x);
    # e**x underflows harmlessly to 0.0 instead of overflowing.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

def evaluate_score(model_output):
    """Convert the first element of *model_output* into a score in [0, 1].

    The first element is converted with ``float`` and passed through
    ``sigmoid``; the result is rounded to three decimals.

    Args:
        model_output: An indexable object whose element ``[0]`` can be
            converted with ``float``.

    Returns:
        The rounded sigmoid score, or ``0.0`` if the value cannot be read,
        converted, or scored.
    """
    try:
        raw_score = float(model_output[0])
        return round(sigmoid(raw_score), 3)
    except Exception:
        # Deliberate best-effort fallback: any ordinary failure yields 0.0.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate so the program can still be stopped.
        return 0.0

# Extract text from selected file
def extract_text(file_name, max_chars=5000):
    """Read the textual content of a .pdf, .docx, .csv or .tsv file.

    The format is chosen from the text after the last "." of the lowercased
    file name. PDFs are read with ``pdfplumber``, DOCX files with
    ``Document``, and CSV/TSV files with ``pd.read_csv``; all three names
    are taken from module scope.

    Args:
        file_name: Path of the file to read.
        max_chars: Maximum number of characters returned.

    Returns:
        The first *max_chars* characters of the extracted text,
        "No text extracted." when nothing was found, or a string starting
        with "Error during text extraction:" when anything fails
        (including an unsupported extension).
    """
    suffix = file_name.lower().rsplit(".", 1)[-1]

    try:
        if suffix == "pdf":
            with pdfplumber.open(file_name) as pdf:
                # Pages without extractable text contribute an empty string.
                page_texts = [page.extract_text() or "" for page in pdf.pages]
            content = "\n".join(page_texts).strip()
        elif suffix == "docx":
            paragraphs = Document(file_name).paragraphs
            content = "\n".join(par.text for par in paragraphs).strip()
        elif suffix in ("csv", "tsv"):
            delimiter = "\t" if suffix == "tsv" else ","
            table = pd.read_csv(file_name, sep=delimiter)
            content = table.to_string(index=False)
        else:
            raise ValueError(f"Unsupported format: .{suffix}")

        if not content:
            return "No text extracted."
        return content[:max_chars]

    except Exception as exc:
        # Failures are reported as text rather than raised to the caller.
        return f"Error during text extraction: {exc}"

# Safely extract textual content from an AIMessage
def extract_text_from_ai(obj):
    """Return the stripped textual content of an AI message-like object.

    The ``content`` attribute is used when present and not ``None``;
    otherwise the object itself is used. Non-string values are converted
    with ``str`` before stripping, so a missing, ``None`` or non-string
    ``content`` no longer raises ``AttributeError``.

    Args:
        obj: An object that may expose a ``content`` attribute, or a plain
            string.

    Returns:
        The content as a ``str`` with surrounding whitespace removed.
    """
    content = getattr(obj, "content", None)
    if content is None:
        # No usable content attribute: fall back to the object itself.
        content = obj
    if not isinstance(content, str):
        content = str(content)
    return content.strip()

# Extract figure captions from text
def extract_captions_from_text(text):
    """Find figure captions such as "Figure 1: ..." or "Fig. 2 - ..." in *text*.

    A caption starts with "Figure" or "Fig" (optionally followed by a dot),
    matched case-insensitively, then a number, an optional separator
    (``:``, ``.``, ``-`` or an en dash) and the rest of the line.

    Args:
        text: The text to scan.

    Returns:
        A list of the full caption strings in order of appearance; empty
        when *text* is empty or contains no caption.
    """
    # Imported locally so the function does not rely on a module-level import.
    import re

    if not text:
        return []
    # The prefix group must be non-capturing: with a capturing group,
    # re.findall returns only the group ("Figure"/"Fig.") rather than the
    # whole caption.
    pattern = r"(?:Figure|Fig\.?)\s*\d+[:\.\-–]?\s*[^\n]+"
    return re.findall(pattern, text, re.IGNORECASE)

# Extract images and captions from a file
def extract_images_with_captions(file_path, output_folder="extracted_figures"):
    """Save the images embedded in a PDF or DOCX file and pair each with a caption.

    Captions are taken, in order, from ``extract_captions_from_text`` applied
    to the document text; once those run out, a generated "Figure N" label
    is used instead. Images are written into *output_folder*, which is
    created if needed.

    Args:
        file_path: Path of the .pdf or .docx file to scan.
        output_folder: Directory that receives the extracted image files.

    Returns:
        A tuple ``(images, captions)`` of equal-length lists: the saved image
        paths and their captions. Both lists are empty for an unsupported
        extension or when extraction fails (the error is printed).
    """
    # Imported locally so the function does not rely on a module-level import.
    import os

    os.makedirs(output_folder, exist_ok=True)
    extension = file_path.lower().split(".")[-1]
    images = []
    captions = []

    try:
        if extension == "pdf":
            # The context manager closes the PDF even if extraction fails
            # midway; previously the document handle was never closed.
            with fitz.open(file_path) as doc:
                full_text = "\n".join(page.get_text("text") for page in doc)
                extracted_captions = extract_captions_from_text(full_text)

                for page_no, page in enumerate(doc, start=1):
                    for img_no, img in enumerate(page.get_images(full=True), start=1):
                        # The first item of each entry identifies the image
                        # for extract_image.
                        base = doc.extract_image(img[0])
                        path = f"{output_folder}/page{page_no}_img{img_no}.{base['ext']}"
                        with open(path, "wb") as f:
                            f.write(base["image"])
                        index = len(captions)
                        images.append(path)
                        captions.append(
                            extracted_captions[index]
                            if index < len(extracted_captions)
                            else f"Figure {page_no}.{img_no}"
                        )

        elif extension == "docx":
            doc = Document(file_path)
            text = "\n".join(p.text for p in doc.paragraphs)
            extracted_captions = extract_captions_from_text(text)

            for i, rel in enumerate(doc.part._rels):
                relation = doc.part._rels[rel]
                # External relationships (e.g. hyperlinks) have no target
                # part; one whose URL contains "image" used to raise and
                # abort the whole extraction.
                if getattr(relation, "is_external", False):
                    continue
                if "image" not in relation.target_ref:
                    continue
                # NOTE(review): the file is always named .png regardless of
                # the embedded image's real format — confirm whether callers
                # rely on that name before changing it.
                name = f"{output_folder}/docx_image_{i+1}.png"
                with open(name, "wb") as f:
                    f.write(relation.target_part.blob)
                index = len(captions)
                images.append(name)
                captions.append(
                    extracted_captions[index]
                    if index < len(extracted_captions)
                    else f"Figure {i+1}"
                )

        else:
            print(f"Unsupported extension: .{extension}")

        print(f"{len(images)} image(s) extracted.")
        return images, captions

    except Exception as e:
        # Best-effort: report the problem and return empty results.
        print(f"Error extracting images: {e}")
        return [], []

# Generate semantic coherence note based on score
def generate_note(score):
    """Describe the semantic coherence level that corresponds to *score*.

    Args:
        score: Numeric coherence score.

    Returns:
        The "High" note for scores above 0.85, the "Moderate" note for
        scores above 0.6, and the "Low" note otherwise.
    """
    # Checked from the highest threshold down; the first one exceeded wins.
    tiers = (
        (0.85, "High semantic coherence. The response is likely solid and relevant."),
        (0.6, "Moderate coherence. The response is understandable but may contain approximations."),
    )
    for threshold, note in tiers:
        if score > threshold:
            return note
    return "Low coherence. It may be helpful to rephrase the question or provide more context."

# Simulate LLM response generation
def generate_response(question, temperature=0.7):
    """Return a canned stand-in for an LLM answer to *question*.

    Args:
        question: The prompt text.
        temperature: Value echoed in the simulated reply.

    Returns:
        A fixed rephrased question when *question* contains "Rephrase"
        (case-sensitive); otherwise a placeholder string that embeds
        *temperature* and *question*.
    """
    wants_rephrase = "Rephrase" in question
    return (
        "How does enthalpy change during a phase transition?"
        if wants_rephrase
        else f"[Simulated response at temperature {temperature} for: {question}]"
    )