special-contextual-translator/streamlit_app.py at main · x1001000/special-contextual-translator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import streamlit as st
import pandas as pd
import json
import time
from call_ai import call_ai
# Sidebar: collect the OpenAI API key and let the user choose a model.
with st.sidebar:
    api_key = st.text_input(
        "這裡輸入你的 OpenAI API Key",
        # The key is a secret: mask it so it is not displayed in clear text
        # on screen (screen sharing, screenshots, shoulder surfing).
        type="password",
    )

    st.markdown("💰 [API 定價](https://openai.com/api/pricing/)")
    # The first entry is preselected (index=0).
    model = st.selectbox(
        "這裡選擇 OpenAI 模型",
        options=[
            "gpt-4.1",
            "gpt-4.1-mini",
            "o3",
            "o3-mini",
            "o4-mini",
        ],
        index=0,
    )
# Prompt template sent to the model. The two `{}` placeholders are filled in
# later with str.format (source language first, target language second).
system_prompt = (
    'You are a translator. '
    'You are translating the keyword from the source language - {} '
    'to the target language – {}.'
    '\n\nOutput JSON.'
)
# Response-format argument handed to call_ai with every request.
response_format = dict(type='json_object')
# Number of long-text rows sent to the model per request.
batch = 100
# Maximum number of tries for one batch before giving up.
attempts = 10
# Seconds to pause after a failed try.
wait_secs = 60

# Page layout: three side-by-side columns created with the width spec [2, 3, 4].
col1, col2, col3 = st.columns([2, 3, 4])

# Column 1: download button serving the bundled workbook template.
with col1:
    st.write("# ☝️⬇️")
    with open('template.xlsx', 'rb') as template_file:
        st.download_button(
            '下載範本',
            template_file,
            file_name='長文本工作表+關鍵字對照表（範本）.xlsx',
            mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        )

# Column 2: uploader for the edited workbook; only .xlsx / .xls are accepted.
with col2:
    st.write("# ✌️⬆️")
    uploaded_file = st.file_uploader(
        '編輯後上傳',
        type=['xlsx', 'xls'],
        help='建議先試較短的文本喔😊',
    )

# Column 3: validate the uploaded workbook, fill in missing keyword
# translations with the model, translate the long-text sheet batch by batch,
# write the result workbook and offer it for download.
with col3:
    st.write("# 👌⬇️")
    # NOTE(review): Streamlit re-executes the script on widget interaction, so
    # pressing the button below presumably repeats the keyword lookups that
    # precede it (extra API calls, possibly different translations than the
    # table the user reviewed). Consider caching `kw` in st.session_state.
    if uploaded_file and 'file_name_target' not in st.session_state:
        # Both sheets are read as strings so values are passed to the model
        # exactly as typed.
        try:
            df = pd.read_excel(uploaded_file, sheet_name='長文本工作表', dtype=str)
            kw = pd.read_excel(uploaded_file, sheet_name='關鍵字對照表', dtype=str)
        except Exception:
            # `Exception` rather than a bare `except`, so KeyboardInterrupt,
            # SystemExit and other BaseException-based control-flow signals
            # are not swallowed.
            st.write("❌ 請確認您上傳的Excel檔有「長文本工作表」及「關鍵字對照表」")
            st.stop()
        # Each sheet must have exactly two columns; unpacking raises
        # ValueError otherwise.
        try:
            source_column, target_column = df.columns
            keyword_column, translation_column = kw.columns
        except ValueError:
            st.write("❌ 請確認您上傳的Excel檔有欄位名稱")
            st.stop()
        if df[source_column].isna().any() or kw[keyword_column].isna().any():
            st.write("❌ 請確認您上傳的Excel檔沒有空行")
            st.stop()

        # The column headers double as the source / target language names.
        system_prompt = system_prompt.format(source_column, target_column)
        # Ask the model only for keywords the user left untranslated.
        for i in kw.index[kw[translation_column].isna()]:
            response = call_ai(kw.loc[i, keyword_column], model, system_prompt, response_format, api_key)
            response_dict = json.loads(response.choices[0].message.content)
            # Accept the answer only when the JSON object holds a single value;
            # anything else leaves the cell empty for the user to fill in.
            if len(response_dict) == 1:
                kw.loc[i, translation_column] = next(iter(response_dict.values()))
        st.write(kw)
        st.write('若關鍵字翻譯結果不理想，請自訂後重新上傳，或直接重新上傳讓AI再試一次🤞')
        st.write('若關鍵字對照表沒有問題，請按下按鈕，開始翻譯長文本工作表👇')

        if st.button('開始翻譯'):
            # Keywords that still have no translation are left out; otherwise
            # they would be sent to the model as "keyword | nan".
            translated = kw[kw[translation_column].notna()]
            reference = '\n'.join(
                f'{keyword} | {translation}'
                for keyword, translation in zip(translated[keyword_column], translated[translation_column])
            )
            system_prompt = system_prompt.replace('the keyword', 'each line')
            system_prompt += f'\n\nMust refer to the translation of the keywords listed below.\n{reference}'
            print(system_prompt)

            total_rows = len(df)
            for start in range(0, total_rows, batch):
                series = df[source_column].iloc[start:start + batch]
                stop = min(start + batch, total_rows)
                # One "<row index>\t<text>" line per row; the reply is expected
                # to be a JSON object keyed by those row indices.
                request_string = '\n'.join(f'{index}\t{value}' for index, value in series.items())
                for attempt in range(1, attempts + 1):
                    st.write(f'⏳ 第 {start+1}~{stop} 行翻譯中...')
                    try:
                        response = call_ai(request_string, model, system_prompt, response_format, api_key)
                        response_dict = json.loads(response.choices[0].message.content)
                        response_series = pd.Series(response_dict)
                        response_series.index = response_series.index.astype(int)
                        # Explicit check instead of `assert` (asserts vanish
                        # under `python -O`); also verifies the returned row
                        # indices, not just how many there are.
                        if sorted(response_series.index) != list(series.index):
                            raise ValueError('AI response rows do not match the requested rows')
                        # combine_first keeps target cells that already hold a
                        # value and fills only the empty ones.
                        df[target_column] = df[target_column].combine_first(response_series)
                        break
                    except Exception:
                        # Report first, then wait; no wait after the last try.
                        if attempt < attempts:
                            st.write('❌ AI出錯了！正在重跑一次...')
                            time.sleep(wait_secs)
                else:
                    st.write(f'❌ {attempts}次翻譯失敗，請聯絡AIValue！')
                    st.stop()
            st.write('✨ 翻譯完成！')

            # Always write an .xlsx file: the uploader also accepts .xls, and
            # the openpyxl writer rejects that extension.
            base_name = uploaded_file.name.rsplit('.', 1)[0]
            file_name_target = f'{base_name}_{target_column}.xlsx'
            with pd.ExcelWriter(file_name_target, engine='openpyxl') as writer:
                df.to_excel(writer, sheet_name='長文本工作表', index=False)
                kw.to_excel(writer, sheet_name='關鍵字對照表', index=False)
            # Remember the result so the download button is shown on reruns.
            st.session_state['file_name_target'] = file_name_target

    if 'file_name_target' in st.session_state:
        with open(st.session_state['file_name_target'], 'rb') as f:
            st.download_button(
                label='下載結果',
                data=f,
                file_name=st.session_state['file_name_target'],
                mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            )