import re
import html

def clean_subtitle_text(text):
    """Return *text* with markup residue removed, ready to write as cue text.

    Steps, in order:

    1. Decode the JSON-style escapes ``\\u003c``, ``\\u003e`` and ``\\u0026``
       (any hex case) into ``<``, ``>`` and ``&``.
    2. Decode HTML character references (``&amp;``, ``&lt;``, ``&#39;`` ...).
    3. Turn every ``<br>`` / ``<br/>`` / ``<br />`` tag into a newline.
    4. Drop opening and closing ``<font ...>`` tags, keeping their content.
    5. Strip leading and trailing whitespace.

    Tag matching is case-insensitive.
    """
    # These sequences are literal backslash-u text left in the data, not
    # HTML entities, so html.unescape() would not touch them. Decoding them
    # first lets the tag handling below see real angle brackets.
    text = re.sub(
        r"\\u(003c|003e|0026)",
        lambda match: chr(int(match.group(1), 16)),
        text,
        flags=re.IGNORECASE,
    )
    text = html.unescape(text)
    # Done after unescaping so that "&lt;br /&gt;" is also treated as a break.
    text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"</?font[^>]*>", "", text, flags=re.IGNORECASE)
    return text.strip()

input_file = "CB_s01e20.srt"
output_file = "translated_clean.srt"

# Matches a line made only of digits (the SRT cue counter). Compiled once
# because it is tested against every line of the file.
_cue_index = re.compile(r"^\d+$")

# Copy the subtitle file line by line: counter lines, timing lines
# (containing "-->") and blank separator lines pass through unchanged;
# every other line is treated as cue text and cleaned.
with open(input_file, "r", encoding="utf-8") as infile, \
        open(output_file, "w", encoding="utf-8") as outfile:
    for line in infile:
        if _cue_index.match(line) or "-->" in line or not line.strip():
            outfile.write(line)
            continue

        # In SRT a blank line terminates the cue. Cleaning can leave a line
        # empty (markup only) or put an empty line inside it (doubled line
        # breaks); writing those out would cut the cue short, so only the
        # non-blank pieces are kept.
        kept = [
            piece
            for piece in clean_subtitle_text(line).split("\n")
            if piece.strip()
        ]
        if kept:
            outfile.write("\n".join(kept) + "\n")
