plex-server-scripts/media-duplicates

227 lines
8.8 KiB
Python
Executable file

#!/usr/bin/python3
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Tuple
import argparse
import json
import os
import random
import requests
import shlex
import sqlite3
import subprocess
import sys
def ffprobe_file_path(file_path: str):
    """Run ``ffprobe`` on *file_path* and return its parsed JSON report.

    The report holds the sections requested with ``-show_format`` and
    ``-show_streams``.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits with a non-zero status.
    """
    command = [
        "ffprobe",
        "-v", "quiet",
        "-print_format", "json",
        "-show_format",
        "-show_streams",
        file_path,
    ]
    # check=True raises on a non-zero exit status, so stdout is only parsed
    # when ffprobe reported success.
    completed = subprocess.run(command, capture_output=True, check=True)
    return json.loads(completed.stdout)
def get_first_video_stream(ffprobe_info):
    """Return the first entry of ``ffprobe_info["streams"]`` that is a video stream.

    Raises:
        Exception: if none of the streams has ``codec_type == "video"``.
    """
    video_streams = (
        candidate
        for candidate in ffprobe_info["streams"]
        if candidate["codec_type"] == "video"
    )
    # Every yielded item was successfully subscripted above, so None can
    # safely act as the "nothing found" marker.
    first_video = next(video_streams, None)
    if first_video is None:
        raise Exception("there is no video stream")
    return first_video
def codec_name_is_h265(codec_name: str) -> bool:
    """Return True when *codec_name* denotes H.265/HEVC.

    A name matches when it contains "265" or, ignoring case, "hevc".
    """
    # Membership tests are used instead of str.find(): find() returns -1
    # (truthy) when the substring is absent and 0 (falsy) when it sits at the
    # start, which is the opposite of what a boolean check needs.
    lowered = codec_name.lower()
    return "265" in lowered or "hevc" in lowered
def compare_ffprobe_output(first, second) -> int:
    """Rank two ffprobe reports against each other.

    Returns:
        A value greater than 0 if *first* is better than *second*, less than 0
        if *first* is worse, and 0 if they are roughly the same.

    The criteria are applied in order until one of them differs:

    1. resolution (width * height of the first video stream), higher wins
    2. codec, h265 wins over anything else
    3. container bit rate, higher wins

    Raises:
        KeyError: if a report lacks a field that is read here (for example
            ``format.bit_rate``).
        ValueError: if a ``bit_rate`` value is not an integer literal.
    """
    first_stream = get_first_video_stream(first)
    second_stream = get_first_video_stream(second)

    # whichever has the most surface area, aka, highest resolution wins by default
    first_area = first_stream["width"] * first_stream["height"]
    second_area = second_stream["width"] * second_stream["height"]
    if first_area != second_area:
        return 1 if first_area > second_area else -1

    # if resolutions are the same, prefer h265
    # bool() keeps the comparison about truthiness only, whatever the helper returns.
    first_is_h265 = bool(codec_name_is_h265(first_stream["codec_name"]))
    second_is_h265 = bool(codec_name_is_h265(second_stream["codec_name"]))
    if first_is_h265 != second_is_h265:
        return 1 if first_is_h265 else -1

    # if both are h265 or both are not h265, use bitrate
    # ffprobe's JSON output carries bit_rate as a decimal string; convert it so
    # the comparison is numeric rather than lexicographic ("9000" > "10000").
    first_bitrate = int(first["format"]["bit_rate"])
    second_bitrate = int(second["format"]["bit_rate"])
    if first_bitrate != second_bitrate:
        return 1 if first_bitrate > second_bitrate else -1

    # at this point, I consider these files to be the same
    return 0
def compare_radarr(radarr_db_path1: str, radarr_db_path2: str) -> List[Tuple[str, str]]:
    """Find movie files that are present in both Radarr databases.

    Both databases are attached read-only (``mode=ro``) to a throwaway
    in-memory connection. Movies are matched on ``ImdbId`` and only movies
    with a ``MovieFiles`` row in both databases are reported.

    Returns:
        A list of ``(path_in_db1, path_in_db2)`` tuples, each path being the
        movie's ``Path`` joined with the file's ``RelativePath`` by ``/``.

    Raises:
        sqlite3.Error: if a database cannot be attached or queried.
    """
    # sqlite3's connection context manager only commits or rolls back; it does
    # not close the connection, so close it explicitly to release the attached
    # database files.
    db = sqlite3.connect(":memory:", uri=True)
    try:
        db.execute("ATTACH DATABASE ? AS ?", (f"file:{radarr_db_path1}?mode=ro", "db1"))
        db.execute("ATTACH DATABASE ? AS ?", (f"file:{radarr_db_path2}?mode=ro", "db2"))
        rows = db.execute("""
            SELECT m1.Path || '/' || mf1.RelativePath, m2.Path || '/' || mf2.RelativePath
            FROM db1.Movies AS m1
            JOIN db2.Movies AS m2 ON m1.ImdbId = m2.ImdbId
            JOIN db1.MovieFiles AS mf1 ON m1.MovieFileId = mf1.Id
            JOIN db2.MovieFiles AS mf2 ON m2.MovieFileId = mf2.Id
        """).fetchall()
    finally:
        db.close()
    return [(path1, path2) for path1, path2 in rows]
def compare_sonarr(sonarr_db_path1: str, sonarr_db_path2: str) -> List[Tuple[str, str]]:
    """Find episode files that are present in both Sonarr databases.

    Both databases are attached read-only (``mode=ro``) to a throwaway
    in-memory connection. Series are matched on ``ImdbId`` and episodes on
    season and episode number; only episodes with an ``EpisodeFiles`` row in
    both databases are reported.

    Returns:
        A list of ``(path_in_db1, path_in_db2)`` tuples, each path being the
        series' ``Path`` joined with the file's ``RelativePath`` by ``/``.

    Raises:
        sqlite3.Error: if a database cannot be attached or queried.
    """
    # sqlite3's connection context manager only commits or rolls back; it does
    # not close the connection, so close it explicitly to release the attached
    # database files.
    db = sqlite3.connect(":memory:", uri=True)
    try:
        db.execute("ATTACH DATABASE ? AS ?", (f"file:{sonarr_db_path1}?mode=ro", "db1"))
        db.execute("ATTACH DATABASE ? AS ?", (f"file:{sonarr_db_path2}?mode=ro", "db2"))
        rows = db.execute("""
            SELECT s1.Path || '/' || ef1.RelativePath, s2.Path || '/' || ef2.RelativePath
            FROM db1.Series AS s1
            JOIN db2.Series AS s2 ON s1.ImdbId = s2.ImdbId
            JOIN db1.Episodes AS e1 ON e1.SeriesId = s1.Id
            JOIN db2.Episodes AS e2 ON e2.SeriesId = s2.Id AND e2.SeasonNumber = e1.SeasonNumber AND e2.EpisodeNumber = e1.EpisodeNumber
            JOIN db1.EpisodeFiles AS ef1 ON e1.EpisodeFileId = ef1.Id
            JOIN db2.EpisodeFiles AS ef2 ON e2.EpisodeFileId = ef2.Id
        """).fetchall()
    finally:
        db.close()
    return [(path1, path2) for path1, path2 in rows]
def rewrite_path(path: str, rewrites: Dict[str, str]) -> str:
    """Swap the leading prefix of *path* according to *rewrites*.

    *rewrites* maps an original prefix to its replacement. The first key (in
    the mapping's iteration order) that *path* starts with is applied. When no
    key matches, or *rewrites* is empty or None, *path* is returned unchanged.
    """
    for prefix, replacement in (rewrites or {}).items():
        if path.startswith(prefix):
            # Only the leading occurrence is swapped; the rest is kept verbatim.
            return replacement + path[len(prefix):]
    return path
def rejigger_rewrite_paths(rewrite_paths: List[str]) -> Dict[str, str]:
    """Turn ``ORIG:REWRITE`` strings into an ``{ORIG: REWRITE}`` mapping.

    Each entry is split on its first ``:`` only, so the rewrite part may itself
    contain colons. A later entry with the same ORIG overrides an earlier one.

    Raises:
        ValueError: if an entry contains no ``:`` separator.
    """
    result = {}
    for spec in rewrite_paths:
        orig, separator, rewrite = spec.partition(":")
        if not separator:
            # Name the offending argument instead of failing with an opaque
            # tuple-unpacking error.
            raise ValueError(f"invalid path prefix rewrite {spec!r}: expected ORIG:REWRITE")
        result[orig] = rewrite
    return result
def compare_media_file_paths(path1: str, path2: str):
    """Compare two media files, skipping pairs that already share an inode.

    Returns:
        The result of ``compare_ffprobe_output`` for the two files, or None
        when nothing can or needs to be compared. None covers three cases:
        both paths refer to the same inode on the same device, ``os.lstat``
        failed, or probing/comparing failed. Failures are reported on stderr.
    """
    try:
        first_stat, second_stat = os.lstat(path1), os.lstat(path2)
        same_file = (first_stat.st_ino, first_stat.st_dev) == (second_stat.st_ino, second_stat.st_dev)
    except Exception as e:
        print(f"failed to check inodes for {path1} and {path2}: {e}", file=sys.stderr)
        return None
    if same_file:
        # Already hard linked to each other; there is nothing to rank.
        return None

    try:
        first_probe = ffprobe_file_path(path1)
        second_probe = ffprobe_file_path(path2)
        return compare_ffprobe_output(first_probe, second_probe)
    except Exception as e:
        print(f"failed to compare ffprobe output for {path1} and {path2}: {e}", file=sys.stderr)
    return None
if __name__ == '__main__':
    # Script entry point: find files that exist in both databases of each
    # application, decide which copy to keep, and emit shell commands that
    # replace the other copy with a hard link. Optionally report to Slack.
    parser = argparse.ArgumentParser(
        description="radarr & sonarr duplicate checker across two database files",
    )
    parser.add_argument('--sonarr-db-paths', type=argparse.FileType('r'), nargs=2, metavar=("/home/sonarr/sonarr.db", "/home/othersonarr/sonarr.db"), required=True)
    parser.add_argument('--radarr-db-paths', type=argparse.FileType('r'), nargs=2, metavar=("/home/radarr/radarr.db", "/home/otherradarr/radarr.db"), required=True)
    parser.add_argument('--path-prefix-rewrite', type=str, nargs='+', metavar=("/media:/mnt/media", "/data:/real/path/data"), help="rewrite the paths for internal database media paths to the 'real' destinations")
    parser.add_argument('--command-output-file', type=argparse.FileType('w'), nargs='?', default=sys.stdout)
    parser.add_argument('--slack-webhook-url', type=str, nargs='?', default=None)
    args = parser.parse_args()

    # argparse opened the database files; only their names are used below, so
    # the handles are closed right away.
    for file_descriptor in args.sonarr_db_paths + args.radarr_db_paths:
        file_descriptor.close()

    duplicate_paths = compare_sonarr(args.sonarr_db_paths[0].name, args.sonarr_db_paths[1].name) + compare_radarr(args.radarr_db_paths[0].name, args.radarr_db_paths[1].name)
    if args.path_prefix_rewrite:
        rewrite_paths = rejigger_rewrite_paths(args.path_prefix_rewrite)
        duplicate_paths = [(rewrite_path(path1, rewrite_paths), rewrite_path(path2, rewrite_paths)) for path1, path2 in duplicate_paths]

    already_saved_space_in_bytes = 0
    space_savings_in_bytes = 0
    files_to_link = 0
    files_already_linked = 0

    # create a thread pool of NumCPUs * 2 workers
    # this is really just used for `ffprobe` concurrent calls
    with ThreadPoolExecutor(max_workers=len(os.sched_getaffinity(0)) * 2) as executor:
        def map_helper(path_pair):
            return compare_media_file_paths(path_pair[0], path_pair[1])

        for (path1, path2), probe_result in zip(duplicate_paths, executor.map(map_helper, duplicate_paths)):
            if probe_result is None:
                # None means either "already hard linked" or "could not be
                # compared". Re-check the inodes so that failures are skipped
                # instead of being counted as linked, and so a missing file
                # cannot abort the whole run.
                try:
                    stat1 = os.lstat(path1)
                    stat2 = os.lstat(path2)
                except OSError:
                    continue
                if stat1.st_ino == stat2.st_ino and stat1.st_dev == stat2.st_dev:
                    files_already_linked += 1
                    already_saved_space_in_bytes += stat1.st_size
                continue

            if probe_result == 0:
                # if the results seem equal, let us flip a figurative coin for which path to overwrite
                print("# this is the result of a coin flip", file=args.command_output_file)
                path1, path2 = random.sample([path1, path2], k=2)
            elif probe_result < 0:
                # path2 is better than path1
                path1, path2 = path2, path1

            # from here on path1 is the copy to keep and path2 the one to replace
            file_size_in_bytes = os.lstat(path2).st_size
            space_savings_in_bytes += file_size_in_bytes
            files_to_link += 1

            print(f"# space savings of {file_size_in_bytes/(1024**3):.2f} GiB", file=args.command_output_file)
            print(f"rm -f {shlex.quote(path2)}", file=args.command_output_file)
            # replace path2's file name with path1's file name
            path2 = os.path.join(os.path.dirname(path2), os.path.basename(path1))
            print(f"ln -f {shlex.quote(path1)} {shlex.quote(path2)}", file=args.command_output_file)

    if args.slack_webhook_url:
        message = "nothing to link for file deduplication"
        if files_to_link > 0:
            message = f"HOLY SMOKES! We can save {space_savings_in_bytes/(1024**3):.2f} GiB by hard linking {files_to_link} files"
        message += f"\nWe already save {already_saved_space_in_bytes/(1024**3):.2f} GiB by hard linking {files_already_linked} files"
        requests.post(
            args.slack_webhook_url,
            headers={
                'Content-type': 'application/json',
            },
            data=json.dumps({
                "username": "Apollo Media Duplicates",
                "icon_emoji": ":floppy_disk:",
                "text": message,
            }),
            # requests waits forever by default; bound the wait so an
            # unresponsive webhook cannot hang the script.
            timeout=30,
        ).raise_for_status()