#!/usr/bin/python3 from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Tuple import argparse import json import os import random import requests import shlex import sqlite3 import subprocess import sys def ffprobe_file_path(file_path: str): result = subprocess.run( ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", "-show_streams", file_path], capture_output=True, ) result.check_returncode() return json.loads(result.stdout) def get_first_video_stream(ffprobe_info): for stream in ffprobe_info["streams"]: if stream["codec_type"] == "video": return stream raise Exception("there is no video stream") def codec_name_is_h265(codec_name: str) -> bool: return codec_name.find("265") or codec_name.find("HEVC") or codec_name.find("hevc") # returns less than 0 if first is worse than second # returns greater than 0 if first is better than second # returns 0 if they are roughly the same def compare_ffprobe_output(first, second) -> int: first_stream = get_first_video_stream(first) second_stream = get_first_video_stream(second) # whichever has the most surface area, aka, highest resolution wins by default first_area = first_stream["width"] * first_stream["height"] second_area = second_stream["width"] * second_stream["height"] if first_area > second_area: return 1 if second_area > first_area: return -1 # if resolutions are the same, prefer h265 first_is_h265 = codec_name_is_h265(first_stream["codec_name"]) second_is_h265 = codec_name_is_h265(second_stream["codec_name"]) if first_is_h265: if not second_is_h265: return 1 elif second_is_h265: return -1 # if both are h265 or both are not h265, use bitrate first_bitrate = first["format"]["bit_rate"] second_bitrate = second["format"]["bit_rate"] if first_bitrate > second_bitrate: return 1 if second_bitrate > first_bitrate: return -1 # at this point, I consider these files to be the same return 0 def compare_radarr(radarr_db_path1: str, radarr_db_path2: str) -> List[Tuple[str, str]]: result = [] with sqlite3.connect(":memory:", uri=True) as db: db.execute("ATTACH DATABASE ? AS ?", (f"file:{radarr_db_path1}?mode=ro", "db1")) db.execute("ATTACH DATABASE ? AS ?", (f"file:{radarr_db_path2}?mode=ro", "db2")) rows = db.execute(""" SELECT m1.Path || '/' || mf1.RelativePath, m2.Path || '/' || mf2.RelativePath FROM db1.Movies AS m1 JOIN db2.Movies AS m2 ON m1.ImdbId = m2.ImdbId JOIN db1.MovieFiles AS mf1 ON m1.MovieFileId = mf1.Id JOIN db2.MovieFiles AS mf2 ON m2.MovieFileId = mf2.Id """).fetchall() for path1, path2 in rows: result.append((path1, path2)) return result def compare_sonarr(sonarr_db_path1: str, sonarr_db_path2: str) -> List[Tuple[str, str]]: result = [] with sqlite3.connect(":memory:", uri=True) as db: db.execute("ATTACH DATABASE ? AS ?", (f"file:{sonarr_db_path1}?mode=ro", "db1")) db.execute("ATTACH DATABASE ? AS ?", (f"file:{sonarr_db_path2}?mode=ro", "db2")) rows = db.execute(""" SELECT s1.Path || '/' || ef1.RelativePath, s2.Path || '/' || ef2.RelativePath FROM db1.Series AS s1 JOIN db2.Series AS s2 ON s1.ImdbId = s2.ImdbId JOIN db1.Episodes AS e1 ON e1.SeriesId = s1.Id JOIN db2.Episodes AS e2 ON e2.SeriesId = s2.Id AND e2.SeasonNumber = e1.SeasonNumber AND e2.EpisodeNumber = e1.EpisodeNumber JOIN db1.EpisodeFiles AS ef1 ON e1.EpisodeFileId = ef1.Id JOIN db2.EpisodeFiles AS ef2 ON e2.EpisodeFileId = ef2.Id """).fetchall() for path1, path2 in rows: result.append((path1, path2)) return result def rewrite_path(path: str, rewrites: Dict[str, str]) -> str: if rewrites: for orig, rewrite in rewrites.items(): if path.startswith(orig): return path.replace(orig, rewrite, 1) return path def rejigger_rewrite_paths(rewrite_paths: List[str]) -> Dict[str, str]: result = {} for rewrite_path in rewrite_paths: orig, rewrite = rewrite_path.split(sep=':', maxsplit=1) result[orig] = rewrite return result def compare_media_file_paths(path1: str, path2: str): try: stat1 = os.lstat(path1) stat2 = os.lstat(path2) if stat1.st_ino == stat2.st_ino and stat1.st_dev == stat2.st_dev: return None except Exception as e: print(f"failed to check inodes for {path1} and {path2}: {e}", file=sys.stderr) return None try: return compare_ffprobe_output(ffprobe_file_path(path1), ffprobe_file_path(path2)) except Exception as e: print(f"failed to compare ffprobe output for {path1} and {path2}: {e}", file=sys.stderr) return None if __name__ == '__main__': parser = argparse.ArgumentParser( description="radarr & sonarr duplicate checker across two database files", ) parser.add_argument('--sonarr-db-paths', type=argparse.FileType('r'), nargs=2, metavar=("/home/sonarr/sonarr.db", "/home/othersonarr/sonarr.db"), required=True) parser.add_argument('--radarr-db-paths', type=argparse.FileType('r'), nargs=2, metavar=("/home/radarr/radarr.db", "/home/otherradarr/radarr.db"), required=True) parser.add_argument('--path-prefix-rewrite', type=str, nargs='+', metavar=("/media:/mnt/media", "/data:/real/path/data"), help="rewrite the paths for internal database media paths to the 'real' destinations") parser.add_argument('--command-output-file', type=argparse.FileType('w'), nargs='?', default=sys.stdout) parser.add_argument('--slack-webhook-url', type=str, nargs='?', default=None) args = parser.parse_args() for file_descriptor in args.sonarr_db_paths + args.radarr_db_paths: file_descriptor.close() duplicate_paths = compare_sonarr(args.sonarr_db_paths[0].name, args.sonarr_db_paths[1].name) + compare_radarr(args.radarr_db_paths[0].name, args.radarr_db_paths[1].name) if args.path_prefix_rewrite: rewrite_paths = rejigger_rewrite_paths(args.path_prefix_rewrite) duplicate_paths = [(rewrite_path(path1, rewrite_paths), rewrite_path(path2, rewrite_paths)) for path1, path2 in duplicate_paths] already_saved_space_in_bytes = 0 space_savings_in_bytes = 0 files_to_link = 0 files_already_linked = 0 # createa a process pool for NumCPUs * 2 # this is really just used for `ffprobe` concurrent calls with ThreadPoolExecutor(max_workers=len(os.sched_getaffinity(0))*2) as executor: def map_helper(args): return compare_media_file_paths(args[0], args[1]) for (path1, path2), probe_result in zip(duplicate_paths, executor.map(map_helper, duplicate_paths)): if probe_result is None: files_already_linked += 1 already_saved_space_in_bytes += os.lstat(path1).st_size continue if probe_result == 0: # if the results seem equal, let us flip a figurative coin for which path to overwrite print("# this is the result of a coin flip", file=args.command_output_file) path1, path2 = random.sample([path1, path2], k=2) elif probe_result < 0: # path2 is better than path1 path1, path2 = path2, path1 file_size_in_bytes = os.lstat(path2).st_size space_savings_in_bytes += file_size_in_bytes files_to_link += 1 print(f"# space savings of {file_size_in_bytes/(1024**3):.2f} GiB", file=args.command_output_file) print(f"rm -f {shlex.quote(path2)}", file=args.command_output_file) # replace path2's file name with path1's file name path2 = os.path.join(os.path.dirname(path2), os.path.basename(path1)) print(f"ln -f {shlex.quote(path1)} {shlex.quote(path2)}", file=args.command_output_file) if args.slack_webhook_url: message = "nothing to link for file deduplication" if files_to_link > 0: message = f"HOLY SMOKES! We can save {space_savings_in_bytes/(1024**3):.2f} GiB by hard linking {files_to_link} files" message += f"\nWe already save {already_saved_space_in_bytes/(1024**3):.2f} GiB by hard linking {files_already_linked} files" requests.post( args.slack_webhook_url, headers={ 'Content-type': 'application/json', }, data=json.dumps({ "username": "Apollo Media Duplicates", "icon_emoji": ":floppy_disk:", "text": message, }), ).raise_for_status()