Git-MRI/Git-MRI.py
2025-05-31 16:57:43 +02:00

208 lines
7.5 KiB
Python

import os
import re
import subprocess
import requests
import random
import string
from urllib.parse import urlparse
# Lower-case file suffixes of the documents that are scanned for image links.
TEXT_EXTENSIONS = {
    '.md',
    '.markdown',
    '.txt',
    '.rst',
    '.adoc',
    '.html',
}

# Lower-case file suffixes considered to be images when looking for
# files that no document references any more.
IMAGE_EXTENSIONS = {
    '.png',
    '.jpg',
    '.jpeg',
    '.gif',
    '.webp',
    '.svg',
}

# Name of the report (written at the repository root) listing every URL
# that could not be turned into a local image, with the reason.
LOG_FILENAME = "no_images.log"
def get_repo_name_from_url(repo_url):
    """Return the repository name encoded in a Git clone URL.

    The name is the last component of the URL path with a trailing
    ``.git`` removed, e.g. ``https://host/user/repo.git`` -> ``repo``.

    Trailing slashes are ignored so ``https://host/user/repo/`` still
    yields ``repo``, and only the ``.git`` suffix is stripped so dotted
    repository names such as ``socket.io`` are kept intact.

    Args:
        repo_url: Clone URL (or scp-like ``user@host:path`` address).

    Returns:
        The repository name, or an empty string when the URL has no
        usable path component.
    """
    # A trailing "/" would otherwise make basename() return "".
    path = urlparse(repo_url).path.rstrip("/")
    name = os.path.basename(path)
    suffix = ".git"
    # Keep a bare ".git" untouched rather than reducing it to "".
    if name.endswith(suffix) and len(name) > len(suffix):
        name = name[:-len(suffix)]
    return name
def is_text_file(filename):
    """Tell whether *filename* ends with one of the ``TEXT_EXTENSIONS``.

    Only the final suffix is examined and the comparison ignores case,
    so ``README.MD`` counts as a text file.
    """
    _, suffix = os.path.splitext(filename)
    return suffix.lower() in TEXT_EXTENSIONS
def generate_unique_filename(extension, existing_names):
    """Build a random file name that is not already in *existing_names*.

    The name is 20 random characters drawn from lower-case ASCII letters
    and digits, followed by *extension* exactly as given (the caller is
    responsible for the leading dot). New names are drawn until one is
    found that is absent from *existing_names*; the collection itself is
    not modified.
    """
    alphabet = string.ascii_lowercase + string.digits

    def draw():
        # One fresh candidate: random 20-character stem + the extension.
        stem = ''.join(random.choices(alphabet, k=20))
        return f"{stem}{extension}"

    candidate = draw()
    while candidate in existing_names:
        candidate = draw()
    return candidate
def mimetotype_to_extension(mime):
    """Map an image MIME type to a file extension (without the dot).

    The value may be a raw ``Content-Type`` header: parameters after a
    ``;`` (for example ``image/svg+xml; charset=utf-8``), surrounding
    whitespace and letter case are ignored before the lookup.

    Args:
        mime: MIME type or ``Content-Type`` header value.

    Returns:
        ``"jpg"``, ``"png"``, ``"gif"``, ``"webp"`` or ``"svg"`` for the
        known types, and the generic ``"img"`` for anything else.
    """
    mapping = {
        "image/jpeg": "jpg",
        "image/png": "png",
        "image/gif": "gif",
        "image/webp": "webp",
        "image/svg+xml": "svg",
    }
    if isinstance(mime, str):
        # Keep only the media type: drop parameters, whitespace and case.
        mime = mime.split(";", 1)[0].strip().lower()
    return mapping.get(mime, "img")
def download_and_replace_images(repo_path):
    """Download remote images referenced by the documents and relink them.

    Every file under *repo_path* accepted by ``is_text_file`` is scanned
    for Markdown targets (``[text](target)`` and ``![alt](target)``) and
    for HTML ``<img src="...">`` attributes whose target starts with
    ``http``. Each target that answers with an ``image/*`` content type
    is saved under ``<repo_path>/images`` with a random unique name and
    the reference is rewritten to a path relative to the document.

    Targets that cannot be fetched or are not images are left unchanged
    and recorded, with the reason, in ``LOG_FILENAME`` at the repository
    root. Once all documents are processed,
    ``move_all_unused_images_to_old`` is called on the repository.

    Args:
        repo_path: Root directory of the cloned repository.
    """
    images_dir = os.path.join(repo_path, "images")
    os.makedirs(images_dir, exist_ok=True)
    modified_files = []
    logged_urls = []
    downloaded_files = set()
    # URL -> absolute path of its local copy, so an image referenced in
    # several places is fetched and stored only once.
    local_copies = {}
    log_path = os.path.join(repo_path, LOG_FILENAME)
    pattern_md = re.compile(r'(!?\[.*?\]\()(.+?)(\))')
    pattern_img_tag = re.compile(r'(<img[^>]*?src=["\'])(https?[^"\']+)(["\'])')

    def fetch(url):
        """Return the local path of the image at *url*, or None on failure."""
        if url in local_copies:
            return local_copies[url]
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            content_type = r.headers.get("Content-Type", "")
            if not content_type.startswith("image/"):
                reason = f"Type no-image ({content_type})"
                logged_urls.append((url, reason))
                print(f"⚠️ Ignored : {url} - {reason}")
                return None
            # Prefer the extension found in the URL; fall back to the
            # one implied by the content type.
            ext = os.path.splitext(urlparse(url).path)[1]
            if not ext:
                ext = mimetotype_to_extension(content_type)
            if not ext.startswith('.'):
                ext = '.' + ext
            filename = generate_unique_filename(ext, downloaded_files)
            downloaded_files.add(filename)
            local_path = os.path.join(images_dir, filename)
            with open(local_path, "wb") as img_file:
                img_file.write(r.content)
            print(f"✅ Image downloaded : {filename}")
        except Exception as e:
            # Deliberately broad: one bad URL must not abort the whole
            # migration; the failure is reported and logged instead.
            reason = f"Error : {e}"
            logged_urls.append((url, reason))
            print(f"❌ Download error {url} - {e}")
            return None
        local_copies[url] = local_path
        return local_path

    def relink(match, base_path, has_title):
        """Return the replacement text for one regex match in *base_path*."""
        target = match.group(2)
        if not target.startswith("http"):
            return match.group(0)
        if has_title:
            # A Markdown target may be followed by a title:
            # ![alt](https://host/a.png "title"). Only the first token
            # is the URL; the rest is kept verbatim.
            url = target.split(None, 1)[0]
            trailer = target[len(url):]
        else:
            url, trailer = target, ""
        local_path = fetch(url)
        if local_path is None:
            return match.group(0)
        rel_path = os.path.relpath(local_path, os.path.dirname(base_path))
        # Links in documents always use forward slashes, even on Windows.
        rel_path = rel_path.replace(os.sep, "/")
        return f"{match.group(1)}{rel_path}{trailer}{match.group(3)}"

    for root, dirs, files in os.walk(repo_path):
        # Prune Git's own metadata so it is never scanned or rewritten.
        dirs[:] = [d for d in dirs if d != ".git"]
        for file in files:
            if not is_text_file(file):
                continue
            full_path = os.path.join(root, file)
            with open(full_path, "r", encoding="utf-8", errors="ignore") as f:
                content = f.read()
            updated = pattern_md.sub(
                lambda m, p=full_path: relink(m, p, True), content)
            updated = pattern_img_tag.sub(
                lambda m, p=full_path: relink(m, p, False), updated)
            # The text changes exactly when at least one link was relinked.
            if updated != content:
                with open(full_path, "w", encoding="utf-8") as f:
                    f.write(updated)
                modified_files.append(full_path)
                print(f"✒️ Modified file : {full_path}")

    if logged_urls:
        with open(log_path, "w", encoding="utf-8") as log_file:
            for url, reason in logged_urls:
                log_file.write(f"{url} - {reason}\n")
        print(f"📝 Log file created : {log_path}")
    print(f"\n{len(modified_files)} Modified file(s).")
    move_all_unused_images_to_old(repo_path)
def move_all_unused_images_to_old(repo_path):
    """Move images that no document mentions into ``images/old``.

    All files under *repo_path* whose suffix is in ``IMAGE_EXTENSIONS``
    are collected, then every file accepted by ``is_text_file`` is read.
    An image counts as referenced when its repository-relative path or
    its bare file name occurs anywhere in one of those documents (a
    plain substring test). Unreferenced images are renamed into
    ``<repo_path>/images/old``; on a name clash a ``_<n>`` counter is
    appended to the file name.

    The ``.git`` directory is never scanned, and images already stored
    in ``images/old`` are not considered again.

    Args:
        repo_path: Root directory of the repository.
    """
    images_dir = os.path.join(repo_path, "images")
    old_dir = os.path.join(images_dir, "old")
    os.makedirs(old_dir, exist_ok=True)
    old_dir_norm = os.path.normpath(old_dir)

    def walk(skip_old):
        """Yield (root, files) for the repository, pruning skipped dirs."""
        for root, dirs, files in os.walk(repo_path):
            # Match the directory by exact name: a substring test on the
            # path would also exclude e.g. ".github" or a checkout whose
            # path merely contains ".git".
            dirs[:] = [
                d for d in dirs
                if d != ".git"
                and not (
                    skip_old
                    and os.path.normpath(os.path.join(root, d)) == old_dir_norm
                )
            ]
            yield root, files

    all_image_paths = []
    # Files already archived in images/old must not be archived again.
    for root, files in walk(skip_old=True):
        for file in files:
            ext = os.path.splitext(file)[1].lower()
            if ext in IMAGE_EXTENSIONS:
                full_path = os.path.join(root, file)
                rel_path = os.path.relpath(full_path, repo_path)
                all_image_paths.append((rel_path, full_path))

    referenced = set()
    for root, files in walk(skip_old=False):
        for file in files:
            if not is_text_file(file):
                continue
            full_path = os.path.join(root, file)
            with open(full_path, "r", encoding="utf-8", errors="ignore") as f:
                content = f.read()
            for rel_path, _ in all_image_paths:
                if rel_path in referenced:
                    continue
                if rel_path in content or os.path.basename(rel_path) in content:
                    referenced.add(rel_path)

    moved = 0
    for rel_path, full_path in all_image_paths:
        if rel_path in referenced:
            continue
        filename = os.path.basename(full_path)
        name, ext = os.path.splitext(filename)
        target_path = os.path.join(old_dir, filename)
        count = 1
        # Never overwrite an archived file: add a numeric suffix instead.
        while os.path.exists(target_path):
            target_path = os.path.join(old_dir, f"{name}_{count}{ext}")
            count += 1
        os.rename(full_path, target_path)
        print(f"↴ Image moved : {rel_path} → images/old/{os.path.basename(target_path)}")
        moved += 1
    print(f"\n{moved} unreferenced image(s) moved in images/old/")
def main():
    """Interactive entry point: clone a repository and localize its images.

    Prints the banner, asks for the repository URL and an existing
    destination folder, clones the repository into
    ``<destination>/<repository name>`` and runs
    ``download_and_replace_images`` on the clone.

    Invalid input (missing destination folder, no repository name in the
    URL, target folder already present) is reported and the function
    returns without doing anything.

    Raises:
        SystemExit: With status 1 when ``git`` is not installed or the
            clone fails, after an error message has been printed.
    """
    print("")
    print("▄▖▘▗ ▖ ▖▄▖▄▖")
    print("▌ ▌▜▘▄▖▛▖▞▌▙▘▐ ")
    print("▙▌▌▐▖ ▌▝ ▌▌▌▟▖")
    print("")
    print("Ce programme python récupère un dépot git avec toute les images et les enregistres localement en mettant à jour les liens vers celles-ci.")
    print("License: CC BY-NC-SA 4.0")
    print("")
    repo_url = input("➤ Enter the URL of the GitHub repository to migrate (*.git) : ").strip()
    destination = input("➤ Enter the absolute path of the destination folder : ").strip()
    if not os.path.isdir(destination):
        print("❌ The specified folder does not exist.")
        return
    repo_name = get_repo_name_from_url(repo_url)
    if not repo_name:
        # Without this check the join below would yield the destination
        # folder itself and trigger a misleading "already exists" message.
        print("❌ Unable to determine the repository name from the URL.")
        return
    repo_path = os.path.join(destination, repo_name)
    if os.path.exists(repo_path):
        print(f"⚠️ The '{repo_path}' folder already exists. Delete it or choose another location.")
        return
    print(f"📥 Cloning the repository in : {repo_path}")
    try:
        # "--" ends option parsing, so a URL starting with "-" cannot be
        # mistaken for a git option.
        subprocess.run(["git", "clone", "--", repo_url, repo_path], check=True)
    except FileNotFoundError:
        print("❌ The 'git' executable was not found. Install Git and try again.")
        raise SystemExit(1)
    except subprocess.CalledProcessError as e:
        print(f"❌ git clone failed (exit code {e.returncode}).")
        raise SystemExit(1)
    download_and_replace_images(repo_path)
    print(f"\n📦 Deposit ready in : {repo_path}")
# Run the interactive workflow only when the file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()