From 58b7437ef81e817aa48b1b622054f6315b247dbf Mon Sep 17 00:00:00 2001
From: Tordarus
Date: Fri, 6 Jun 2025 16:35:29 +0200
Subject: [PATCH] initial commit

---
 analyze_file.go   |  74 +++++++++++++++++++++++
 errors.go         |  13 ++++
 go.mod            |  11 ++++
 go.sum            |   8 +++
 lang_codes.go     | 148 ++++++++++++++++++++++++++++++++++++++++++++++
 parse_file.go     |  56 ++++++++++++++++++
 parse_language.go |  11 ++++
 parse_torrent.go  |  93 +++++++++++++++++++++++++++++
 parsers.go        | 128 +++++++++++++++++++++++++++++++++++++++
 utils.go          |  22 +++++++
 10 files changed, 564 insertions(+)
 create mode 100644 analyze_file.go
 create mode 100644 errors.go
 create mode 100644 go.mod
 create mode 100644 go.sum
 create mode 100644 lang_codes.go
 create mode 100644 parse_file.go
 create mode 100644 parse_language.go
 create mode 100644 parse_torrent.go
 create mode 100644 parsers.go
 create mode 100644 utils.go

diff --git a/analyze_file.go b/analyze_file.go
new file mode 100644
index 0000000..639f781
--- /dev/null
+++ b/analyze_file.go
@@ -0,0 +1,74 @@
+package parsers
+
+import (
+	"context"
+	"os"
+
+	"git.tordarus.net/nyaanime/model"
+	"gopkg.in/vansante/go-ffprobe.v2"
+)
+
+// AnalyzeFile probes the media file at path with ffprobe and reports the
+// resolution plus the audio and subtitle languages it finds.
+//
+// Resolution and fallback language come from the first video stream flagged
+// as default. Audio and subtitle streams contribute their own language tag;
+// an untagged default stream inherits the fallback language. Streams whose
+// language cannot be determined are skipped so that the returned lists never
+// contain empty language codes.
+//
+// An error is returned if the file cannot be opened or probing fails.
+//
+// TODO cache
+func AnalyzeFile(path string) (*model.ParsedFile, error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	data, err := ffprobe.ProbeReader(context.Background(), file)
+	if err != nil {
+		return nil, err
+	}
+
+	props := &model.ParsedFile{File: path}
+
+	// the default video stream determines resolution and fallback language
+	defaultVideoLang := ""
+	for _, s := range data.StreamType(ffprobe.StreamVideo) {
+		if s.Disposition.Default > 0 {
+			props.Resolution = model.Resolution(s.Height)
+			defaultVideoLang = ParseLanguage(s.Tags.Language)
+			break
+		}
+	}
+
+	for _, s := range data.StreamType(ffprobe.StreamAudio) {
+		if lang, ok := streamLanguage(s.Tags.Language, s.Disposition.Default > 0, defaultVideoLang); ok {
+			props.Languages = append(props.Languages, lang)
+		}
+	}
+
+	for _, s := range data.StreamType(ffprobe.StreamSubtitle) {
+		if lang, ok := streamLanguage(s.Tags.Language, s.Disposition.Default > 0, defaultVideoLang); ok {
+			props.Subtitles = append(props.Subtitles, lang)
+		}
+	}
+
+	return props, nil
+}
+
+// streamLanguage resolves the language code of a single audio or subtitle
+// stream. A tagged stream yields its normalized tag; an untagged default
+// stream yields fallback. ok is false when no non-empty code is available.
+func streamLanguage(tag string, isDefault bool, fallback string) (lang string, ok bool) {
+	switch {
+	case tag != "":
+		return ParseLanguage(tag), true
+	case isDefault && fallback != "":
+		return fallback, true
+	default:
+		return "", false
+	}
+}
diff --git a/errors.go b/errors.go
new file mode 100644
index 0000000..8af1f88
--- /dev/null
+++ b/errors.go
@@ -0,0 +1,13 @@
+package parsers
+
+import "git.tordarus.net/tordarus/adverr/v2"
+
+// Error templates used by the regex parser constructors to report parse
+// options which are inconsistent with the given regex.
+var (
+	ErrTorrentParserInsufficientData           = adverr.NewErrTmpl("ErrTorrentParserInsufficientData", "regex '%s' must at least provide title and episode")
+	ErrTorrentParserInsufficientLanguageData   = adverr.NewErrTmpl("ErrTorrentParserInsufficientLanguageData", "no language reference in regex and no default language set")
+	ErrTorrentParserInsufficientSubtitleData   = adverr.NewErrTmpl("ErrTorrentParserInsufficientSubtitleData", "no subtitle reference in regex and no default subtitle set")
+	ErrTorrentParserInsufficientResolutionData = adverr.NewErrTmpl("ErrTorrentParserInsufficientResolutionData", "no resolution reference in regex and no default resolution set")
+	ErrTorrentParserInvalidGroupReference      = adverr.NewErrTmpl("ErrTorrentParserInvalidGroupReference", "options references group %d but regex only has %d groups")
+)
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..e9497f6
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,11 @@
+module git.tordarus.net/tordarus/parsers
+
+go 1.18
+
+require (
+	git.tordarus.net/nyaanime/model v0.0.1
+	git.tordarus.net/tordarus/adverr/v2 v2.0.2
+	gopkg.in/vansante/go-ffprobe.v2 v2.2.1
+)
+
+require git.tordarus.net/tordarus/anilist v1.5.2 // indirect
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..205a139
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,8 @@
+git.tordarus.net/nyaanime/model v0.0.1 h1:/I+87Z6eEw/o2adltKnCk4FZai2mPekjYlzEjY1ppyQ=
+git.tordarus.net/nyaanime/model v0.0.1/go.mod h1:oHV82UMNy4XgPHkI6tZiwabdi6myqHXgjMi9sNZ+rG4=
+git.tordarus.net/tordarus/adverr/v2 v2.0.2 h1:7nvNjMMjtGPq0EY6duMiv+seJ7MacNvKSBmckHl6Erg=
+git.tordarus.net/tordarus/adverr/v2 v2.0.2/go.mod h1:gCC46KsWosZJh7MVNDEU99hKQoxEWZgHITDHtmFwwiQ=
+git.tordarus.net/tordarus/anilist v1.5.2 h1:SxlovS+e3lgL2SowQQwj8dQrIZzRFPomcGCw3V+My0Q=
+git.tordarus.net/tordarus/anilist v1.5.2/go.mod h1:Mrhx/9+8HJVj5ebQ5fJuXqL220tEJhgQIqFK2WKPXgA=
+gopkg.in/vansante/go-ffprobe.v2 v2.2.1 h1:sFV08OT1eZ1yroLCZVClIVd9YySgCh9eGjBWO0oRayI=
+gopkg.in/vansante/go-ffprobe.v2 v2.2.1/go.mod h1:qF0AlAjk7Nqzqf3y333Ly+KxN3cKF2JqA3JT5ZheUGE=
diff --git a/lang_codes.go b/lang_codes.go
new file mode 100644
index 0000000..7b6bb7a
--- /dev/null
+++ b/lang_codes.go
@@ -0,0 +1,166 @@
+package parsers
+
+import "strings"
+
+// langSynonyms maps irregular language codes to ISO 639-1. Keys are the
+// lower-case codes observed in the sources named in the trailing comments
+// (ffprobe stream tags and release group titles) or defined by ISO 639-2.
+var langSynonyms = map[string]string{
+	// english
+	"eng": "en", // Erai-Raws | ffprobe
+	"us":  "en", // Erai-Raws
+
+	// portuguese
+	"por-br": "pt", // Erai-Raws
+	"por":    "pt", // Erai-Raws | ffprobe
+	"br":     "pt", // Erai-Raws
+
+	// spanish
+	"spa-la": "es", // Erai-Raws
+	"spa":    "es", // Erai-Raws | ffprobe
+	"mx":     "es", // Erai-Raws
+
+	// arabic
+	"ara": "ar", // Erai-Raws | ffprobe
+	"sa":  "ar", // Erai-Raws
+
+	// french
+	"fre": "fr", // Erai-Raws | ffprobe
+	"fra": "fr", // ISO 639-2/T
+
+	// german
+	"ger": "de", // Erai-Raws | ffprobe
+	"deu": "de", // ISO 639-2/T
+
+	// italian
+	"ita": "it", // Erai-Raws | ffprobe
+
+	// finnish
+	"fin": "fi", // ffprobe
+
+	// russian
+	"rus": "ru", // Erai-Raws
+
+	// japanese
+	"jpn": "ja", // Erai-Raws | ffprobe
+	"jp":  "ja", // Erai-Raws
+
+	// polish
+	"pol": "pl", // Erai-Raws | ffprobe
+
+	// dutch
+	"dut": "nl", // Erai-Raws | ffprobe
+	"nld": "nl", // ISO 639-2/T
+
+	// norwegian
+	"nob": "no", // Erai-Raws | ffprobe
+	"nor": "no", // ISO 639-2
+
+	// turkish
+	"tur": "tr", // Erai-Raws | ffprobe
+
+	// swedish
+	"swe": "sv", // Erai-Raws | ffprobe
+	"se":  "sv", // Erai-Raws
+
+	// greek
+	"gre": "el", // Erai-Raws | ffprobe
+	"ell": "el", // ISO 639-2/T
+	"gr":  "el", // Erai-Raws
+
+	// hebrew
+	"heb": "he", // Erai-Raws | ffprobe
+	"il":  "he", // Erai-Raws
+
+	// romanian
+	"rum": "ro", // Erai-Raws
+	"ron": "ro", // ISO 639-2/T
+	"rom": "ro", // ffprobe
+
+	// indonesian
+	"ind": "id", // Erai-Raws
+
+	// thai
+	"tha": "th", // Erai-Raws | ffprobe
+
+	// korean
+	"kor": "ko", // Erai-Raws | ffprobe
+	"kr":  "ko", // Erai-Raws
+
+	// danish
+	"dan": "da", // Erai-Raws | ffprobe
+	"dk":  "da", // Erai-Raws
+
+	// chinese (simplified & traditional)
+	"chi": "zh", // Erai-Raws | ffprobe
+	"zho": "zh", // ISO 639-2/T
+	"cn":  "zh", // Erai-Raws
+
+	// bulgarian
+	"bul": "bg", // Erai-Raws | ffprobe
+
+	// vietnamese
+	"vie": "vi", // Erai-Raws
+	"vn":  "vi", // Erai-Raws
+
+	// hindi
+	"hin": "hi", // Erai-Raws
+	"in":  "hi", // Erai-Raws
+
+	// tamil
+	"tam": "ta", // ISO 639-2
+	"lk":  "ta", // Erai-Raws
+
+	// telugu ("tel" is the ISO 639-2 code for Telugu, not Tamil)
+	"tel": "te", // Erai-Raws
+
+	// ukrainian
+	"ukr": "uk", // Erai-Raws
+	"ua":  "uk", // Erai-Raws
+
+	// hungarian
+	"hun": "hu", // Erai-Raws
+
+	// czech
+	"ces": "cs", // Erai-Raws
+	"cze": "cs", // ISO 639-2/B
+	"cz":  "cs", // Erai-Raws
+
+	// croatian
+	"hrv": "hr", // Erai-Raws
+
+	// malaysian
+	"may": "ms", // Erai-Raws
+	"msa": "ms", // ISO 639-2/T
+	"my":  "ms", // Erai-Raws
+
+	// slovakian
+	"slk": "sk", // Erai-Raws
+	"slo": "sk", // ISO 639-2/B
+
+	// filipino
+	"fil": "tl", // Erai-Raws
+	"ph":  "tl", // Erai-Raws
+}
+
+// ParseLanguage converts an irregular language code to ISO 639-1.
+// The lookup is case-insensitive; codes without a known synonym are
+// returned unchanged apart from being lower-cased.
+func ParseLanguage(str string) string {
+	lower := strings.ToLower(str)
+	if code, ok := langSynonyms[lower]; ok {
+		return code
+	}
+
+	return lower
+}
+
+// ParseLanguages converts multiple irregular language codes to ISO 639-1.
+// It simply calls ParseLanguage for each language code
+func ParseLanguages(langCodes []string) []string {
+	codes := make([]string, 0, len(langCodes))
+	for _, code := range langCodes {
+		codes = append(codes, ParseLanguage(code))
+	}
+	return codes
+}
diff --git a/parse_file.go b/parse_file.go
new file mode 100644
index 0000000..afc08f1
--- /dev/null
+++ b/parse_file.go
@@ -0,0 +1,60 @@
+package parsers
+
+import (
+	"path/filepath"
+	"regexp"
+
+	"git.tordarus.net/nyaanime/model"
+)
+
+// FileParseOptions holds the subgroup index in which information can be found in a given regex
+type FileParseOptions struct {
+	Name    int
+	Episode int
+}
+
+// RegexFileParser returns a file parser which matches the base name of a
+// path against regex and reads anime title and episode from the capture
+// groups referenced by options. Stream information is added by AnalyzeFile.
+//
+// It panics if regex does not compile, if options lacks a title or episode
+// group, or if a group reference lies outside the groups of regex.
+func RegexFileParser(regex string, options FileParseOptions) model.FileParserFunc {
+	pattern := regexp.MustCompile(regex)
+
+	// handle faulty regexes
+	if options.Name == 0 || options.Episode == 0 {
+		panic(ErrTorrentParserInsufficientData.New(regex))
+	}
+
+	// handle faulty group references (negative indices would otherwise panic while parsing)
+	for _, g := range []int{options.Name, options.Episode} {
+		if g < 0 || g > pattern.NumSubexp() {
+			panic(ErrTorrentParserInvalidGroupReference.New(g, pattern.NumSubexp()))
+		}
+	}
+
+	return func(parser *model.Parser, path string) (file *model.ParsedFile, ok bool) {
+		matches := pattern.FindStringSubmatch(filepath.Base(path))
+		if matches == nil {
+			return nil, false
+		}
+
+		episode, ok := atoi(matches[options.Episode])
+		if !ok {
+			return nil, false
+		}
+
+		parsedFile, err := AnalyzeFile(path)
+		if err != nil {
+			return nil, false
+		}
+
+		parsedFile.OriginalAnimeTitle = matches[options.Name]
+		parsedFile.Episode = episode
+		parsedFile.Parser = parser
+		parsedFile.File = path
+
+		return parsedFile, true
+	}
+}
diff --git a/parse_language.go b/parse_language.go
new file mode 100644
index 0000000..1ecdbcb
--- /dev/null
+++ b/parse_language.go
@@ -0,0 +1,16 @@
+package parsers
+
+import "regexp"
+
+// LanguageParserFunc extracts the individual language codes from the part of
+// a torrent title captured by a language or subtitle group.
+type LanguageParserFunc func(str string) []string
+
+// SquareBracketsLanguageParserRegex matches every non-empty [...] segment.
+var SquareBracketsLanguageParserRegex = regexp.MustCompile(`\[.+?\]`)
+
+// SquareBracketsLanguageParser returns the contents of all square bracket
+// segments in str, e.g. "[us][br]" yields "us" and "br".
+func SquareBracketsLanguageParser(str string) []string {
+	return TrimPrefixSuffix(SquareBracketsLanguageParserRegex.FindAllString(str, -1), "[", "]")
+}
diff --git a/parse_torrent.go b/parse_torrent.go
new file mode 100644
index 0000000..49f1e65
--- /dev/null
+++ b/parse_torrent.go
@@ -0,0 +1,111 @@
+package parsers
+
+import (
+	"fmt"
+	"regexp"
+
+	"git.tordarus.net/nyaanime/model"
+)
+
+// TorrentParseOptions holds the subgroup index in which information can be found in a given regex
+// as well as some parser specific functions
+type TorrentParseOptions struct {
+	// regex group references
+	Name       int
+	Episode    int
+	Languages  int
+	Subtitles  int
+	Resolution int
+
+	// language parsers
+	LanguageParser LanguageParserFunc
+	SubtitleParser LanguageParserFunc
+
+	// default values used when group reference is 0
+	DefaultLanguages  []string
+	DefaultSubtitles  []string
+	DefaultResolution model.Resolution
+}
+
+// RegexTorrentParser returns a torrent parser which matches torrent titles
+// against regex and reads title, episode, resolution, languages and
+// subtitles from the capture groups referenced by options. Values without a
+// group reference fall back to the corresponding default in options.
+//
+// All configuration errors are reported by panicking at construction time:
+// an invalid regex, missing title or episode groups, a value with neither a
+// group reference nor a default, a group reference outside the groups of
+// regex, or a referenced language/subtitle group without a parser function.
+func RegexTorrentParser(regex string, options TorrentParseOptions) model.TorrentParserFunc {
+	pattern := regexp.MustCompile(regex)
+
+	// handle faulty regexes
+	switch {
+	case options.Name == 0 || options.Episode == 0:
+		panic(ErrTorrentParserInsufficientData.New(regex))
+	case options.Languages == 0 && options.DefaultLanguages == nil:
+		panic(ErrTorrentParserInsufficientLanguageData.New(regex))
+	case options.Subtitles == 0 && options.DefaultSubtitles == nil:
+		panic(ErrTorrentParserInsufficientSubtitleData.New(regex))
+	case options.Resolution == 0 && options.DefaultResolution == 0:
+		panic(ErrTorrentParserInsufficientResolutionData.New(regex))
+	}
+
+	// handle faulty group references (negative indices would otherwise panic while parsing)
+	for _, g := range []int{options.Name, options.Episode, options.Languages, options.Subtitles, options.Resolution} {
+		if g < 0 || g > pattern.NumSubexp() {
+			panic(ErrTorrentParserInvalidGroupReference.New(g, pattern.NumSubexp()))
+		}
+	}
+
+	// handle missing parser functions, which would otherwise only surface
+	// as a nil function call while parsing a matching torrent
+	if options.Languages != 0 && options.LanguageParser == nil {
+		panic(fmt.Errorf("regex '%s' references language group %d but no LanguageParser is set", regex, options.Languages))
+	}
+	if options.Subtitles != 0 && options.SubtitleParser == nil {
+		panic(fmt.Errorf("regex '%s' references subtitle group %d but no SubtitleParser is set", regex, options.Subtitles))
+	}
+
+	return func(parser *model.Parser, torrent *model.Torrent) (ParsedTorrent *model.ParsedTorrent, ok bool) {
+		matches := pattern.FindStringSubmatch(torrent.Title)
+		if matches == nil {
+			return nil, false
+		}
+
+		episode, ok := atoi(matches[options.Episode])
+		if !ok {
+			return nil, false
+		}
+
+		resolution := options.DefaultResolution
+		if options.Resolution != 0 {
+			parsed, err := model.ParseResolution(matches[options.Resolution])
+			if err != nil {
+				return nil, false
+			}
+			resolution = parsed
+		}
+
+		languages := options.DefaultLanguages
+		if options.Languages != 0 {
+			languages = options.LanguageParser(matches[options.Languages])
+		}
+
+		subtitles := options.DefaultSubtitles
+		if options.Subtitles != 0 {
+			subtitles = options.SubtitleParser(matches[options.Subtitles])
+		}
+
+		return &model.ParsedTorrent{
+			OriginalAnimeTitle: matches[options.Name],
+			Episode:            episode,
+			Resolution:         resolution,
+			Parser:             parser,
+			Languages:          ParseLanguages(languages),
+			Subtitles:          ParseLanguages(subtitles),
+
+			Torrent: torrent,
+		}, true
+	}
+}
diff --git a/parsers.go b/parsers.go
new file mode 100644
index 0000000..39ddd62
--- /dev/null
+++ b/parsers.go
@@ -0,0 +1,130 @@
+package parsers
+
+import "git.tordarus.net/nyaanime/model"
+
+/*
+how to get all torrent names on a nyaa page:
+
+let s = "";
+document.querySelectorAll("tr > td:nth-child(2) > a:not(.comments)").forEach(element => {
+	s += element.textContent + "\n";
+})
+console.log(s);
+
+*/
+
+// Parsers lists the release groups known to this package together with the
+// regex based torrent and file parsers for their naming schemes.
+var Parsers = []model.Parser{
+	{
+		Identity: "Erai-Raws",
+		TorrentParser: RegexTorrentParser(
+			`^\[Erai-raws\] (.*) - (.*?) (?:END )?(?:\[v\d+\])?\[(.*?)p\](?:\[HEVC\])?(?:\[Multiple Subtitle\])?(?:\s(\[.*?\]+)?|\[[A-Z0-9]{8}\]\.mkv)$`,
+			TorrentParseOptions{
+				Name:             1,
+				Episode:          2,
+				Resolution:       3,
+				Subtitles:        4,
+				SubtitleParser:   SquareBracketsLanguageParser,
+				DefaultLanguages: []string{"ja"},
+			},
+		),
+		FileParser: RegexFileParser(
+			`^\[Erai-raws\] (.*?) - (\d+?) .*?\.mkv$`,
+			FileParseOptions{
+				Name:    1,
+				Episode: 2,
+			},
+		),
+	},
+
+	{
+		Identity: "SubsPlease",
+		TorrentParser: RegexTorrentParser(
+			`^\[SubsPlease\] (.*) - (\d+?) \((.*?)\) \[.*?\]\.mkv$`,
+			TorrentParseOptions{
+				Name:             1,
+				Episode:          2,
+				Resolution:       3,
+				DefaultLanguages: []string{"ja"},
+				DefaultSubtitles: []string{"en"},
+			},
+		),
+		FileParser: RegexFileParser(
+			`^\[SubsPlease\] (.*?) - (\d+?) .*?\.mkv$`,
+			FileParseOptions{
+				Name:    1,
+				Episode: 2,
+			},
+		),
+	},
+
+	{
+		Identity: "PuyaSubs!",
+		TorrentParser: RegexTorrentParser(
+			`^\[PuyaSubs!\] (.*) - (\d+?) \[ESP-ENG\]\[(.*?)\]\[.*?\]\.mkv$`,
+			TorrentParseOptions{
+				Name:             1,
+				Episode:          2,
+				Resolution:       3,
+				DefaultLanguages: []string{"ja"},
+				DefaultSubtitles: []string{"en"},
+			},
+		),
+		FileParser: RegexFileParser(
+			`^\[PuyaSubs!\] (.*?) - (\d+?) .*?\.mkv$`,
+			FileParseOptions{
+				Name:    1,
+				Episode: 2,
+			},
+		),
+
+		// tag 0th audio stream as japanese language and copy all other streams unchanged into output file
+		FileEncoding: "-map 0 -c:v copy -c:a copy -c:s copy -metadata:s:a:0 language=jpn",
+	},
+
+	{
+		Identity: "NanakoRaws-JP",
+		TorrentParser: RegexTorrentParser(
+			`^\[NanakoRaws\] (.*?) - (\d+?)(?:v\d+)?(?: END)? \((.*?)p\)(?: \(.*?\))?\.mkv \(include JPsub.*?\)$`,
+			TorrentParseOptions{
+				Name:             1,
+				Episode:          2,
+				Resolution:       3,
+				DefaultLanguages: []string{"ja"},
+				DefaultSubtitles: []string{"ja"},
+			},
+		),
+		FileParser: RegexFileParser(
+			`^\[NanakoRaws\] (.*?) - (\d+?)(?:v\d+)?(?: END)? \((.*?)p\)(?: \(.*?\))?\.mkv$`,
+			FileParseOptions{
+				Name:    1,
+				Episode: 2,
+			},
+		),
+
+		// tag 0th audio stream and subtitle stream as japanese language and copy all other streams unchanged into output file
+		FileEncoding: "-map 0 -c:v copy -c:a copy -c:s copy -metadata:s:a:0 language=jpn -metadata:s:s:0 language=jpn",
+	},
+
+	{
+		Identity: "Ohys-Raws",
+		TorrentParser: RegexTorrentParser(
+			`^\[Ohys-Raws\] (.*?) - (\d+?) \(.*? \d+x(\d+?) .*?\)(?: v2)?\.mp4$`,
+			TorrentParseOptions{
+				Name:             1,
+				Episode:          2,
+				Resolution:       3,
+				DefaultLanguages: []string{"ja"},
+				DefaultSubtitles: []string{},
+			},
+		),
+		FileParser: RegexFileParser(
+			`^\[Ohys-Raws\] (.*) - (.*?) (?:END )?\(.*?\)(?: v2)?\.mp4$`,
+			FileParseOptions{
+				Name:    1,
+				Episode: 2,
+			},
+		),
+	},
+}
diff --git a/utils.go b/utils.go
new file mode 100644
index 0000000..e60493b
--- /dev/null
+++ b/utils.go
@@ -0,0 +1,25 @@
+package parsers
+
+import (
+	"strconv"
+	"strings"
+)
+
+// atoi converts s to an int and reports whether the conversion succeeded.
+func atoi(s string) (int, bool) {
+	v, err := strconv.Atoi(s)
+	if err != nil {
+		return 0, false
+	}
+	return v, true
+}
+
+// TrimPrefixSuffix returns a new slice in which prefix and suffix have been
+// removed from every element of arr that carries them.
+func TrimPrefixSuffix(arr []string, prefix, suffix string) []string {
+	trims := make([]string, 0, len(arr))
+	for _, str := range arr {
+		trims = append(trims, strings.TrimSuffix(strings.TrimPrefix(str, prefix), suffix))
+	}
+	return trims
+}