Parallelize and fix more bugs

author: Dylan Jones <dylanjones2011@gmail.com> 2020-11-25 22:30:20 -0500
committer: Dylan Jones <dylanjones2011@gmail.com> 2020-11-25 22:30:20 -0500
commit: 2c29fbad324c826e118be42e510a8388e4648ef5 (patch)
tree: 2255f83ccb47235cc64cf0b319eb1a5439549e2b
parent: 407d3016088d756020662cd9e2c44089ada6aefb (diff)
download: modpackman-2c29fbad324c826e118be42e510a8388e4648ef5.tar.gz modpackman-2c29fbad324c826e118be42e510a8388e4648ef5.zip
2 files changed, 116 insertions, 22 deletions
diff --git a/.gitignore b/.gitignore
index f14d830..e626d72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,5 @@
 pack-location.txt
+geckodriver
+geckodriver.exe
+geckodriver.log
+__pycache__/
diff --git a/update.py b/update.py
index 38af029..28ee252 100755
--- a/update.py
+++ b/update.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-
 import argparse
 import os
 import sys
@@ -8,6 +7,7 @@ import shutil
 import re
 import collections
 import urllib.parse
+import multiprocessing
 
 import requests
 
@@ -49,6 +49,10 @@ parser.add_argument("--game-version",
 VERSION = 0
 
 def read_file(fil):
+    """
+    Given a filename, read its contents in as a list of tuples.
+    This function strips out comment lines and whitespace.
+    """
     strings = []
     with open(fil) as f:
         for line in f:
@@ -102,32 +106,27 @@ def apply_updates(args):
         version = tuple(int(x) for x in args.game_version.split('.'))
     else:
         version = (2, 0, 0)
-    print("Populating version File...")
+    print("Populating version file...")
     mods = read_file(args.filename)
     print("Getting new versions of all mods...")
-    ffx = firefox()
+    mod_urls = find_updated_urls([x[1] for x in mods], version)
+    print("Downloading and checksumming all mods...")
+    checksums = find_checksums(mod_urls)
+
+    # Write information out to version.txt
     with open(args.version_file, 'w') as f:
         f.write('# Format: <jarname> <hex digested sha1> <direct download url>\n')
         f.write("#VERSION " + str(VERSION + 1) + "\n")
-        for mod in mods:
-            print("Fetching {mod[0]}...".format(mod=mod))
-            if 'curseforge' in mod[1]:
-                url = find_cdn(ffx, mod[1], version)
-            else:
-                url = requests.get(mod[1]).url
-            if url is None:
-                print('[!]Failed to fetch {mod[0]}!'.format(mod=mod))
-                continue
-            resp = requests.get(url)
-            hsh = hashlib.sha1(resp.content).hexdigest()
-            f.write('{mod[0]} {hsh} {resp.url}\n'.format(mod=mod, hsh=hsh, resp=resp))
-    ffx.close()
+        for name, checksum, url in zip((mod[0] for mod in mods), checksums, mod_urls):
+            f.write(f'{name} {checksum} {url}\n')
+
     print()
     print("Done!")
     print("Updates applied to {args.version_file}".format(args=args))
     print("New pack version is " + str(VERSION + 1))
     print("[!] No mods were installed. To update your mods folder, run 'update.py install'")
 
+
 # Find if any updates are available
 def check_updates(args):
     if args.game_version is not None:
@@ -162,13 +161,83 @@ def check_updates(args):
         print("Run 'python update.py apply_updates' to create a new version with these updates applied.")
 
 
+def threaded_find_url(homepage_url, game_version):
+    """
+    Helper function that finds a single mod URL based on the homepage.
+    """
+    if 'curseforge' in homepage_url:
+        ffx = firefox()
+        final_url = find_cdn(ffx, homepage_url, game_version)
+        ffx.close()
+    else:
+        final_url = requests.get(homepage_url).url
+    return final_url
+
+
+def find_updated_urls(forge_urls, game_version, threads=20):
+    """
+    Given a list of mod homepage URLs, find all of their direct download links in parallel.
+    """
+
+    # First, check that we can successfully open a Firefox instance in the main thread.
+    # This provides us with a much nicer error message and quicker feedback.
+    f = firefox()
+    f.close()
+
+    with multiprocessing.Pool(threads) as pool:
+        # No progress indicator possible
+        # return pool.map(threaded_find_url, forge_urls)
+
+        # Much longer, but allows us to do a nice progress indicator
+        result_futures = []
+        for url in forge_urls:
+            result_futures.append(pool.apply_async(threaded_find_url, (url, game_version)))
+
+        results = []
+        for i,f in enumerate(result_futures):
+            results.append(f.get())
+            print(f'\r{i+1}/{len(result_futures)} URLs updated ({round((i+1)/len(result_futures)*100)}%)', end='')
+        print()
+
+        return results
+
+
+def threaded_calc_sha1(direct_url):
+    """
+    Helper function that downloads and calculates a single SHA1 hash from a direct download URL.
+    """
+    resp = requests.get(direct_url)
+    hsh = hashlib.sha1(resp.content).hexdigest()
+    return hsh
+
+
+def find_checksums(direct_urls, threads=8):
+    """
+    Given a list of direct download URLs, download them all and calculate the SHA1 checksum of the file at that location.
+    """
+    
+    with multiprocessing.Pool(threads) as pool:
+        # Much longer, but allows us to do a nice progress indicator
+        result_futures = []
+        for url in direct_urls:
+            result_futures.append(pool.apply_async(threaded_calc_sha1, (url,)))
+
+        results = []
+        for i,f in enumerate(result_futures):
+            results.append(f.get())
+            print(f'\r{i+1}/{len(result_futures)} checksums calculated ({round((i+1)/len(result_futures)*100)}%)', end='')
+        print()
+
+        return results
+
+
 def find_cdn(ffx, url, version):
     """
     Given a mod home URL, finds the most up-to-date mod version compatible with the given game version.
     Returns the direct Forge CDN download URL
     """
-    #TODO filter mods by forge/fabric compatibility
     try:
+        # This goes to the "all files" page, where we get a table view of all of the mod's files.
         ffx.get(url + '/files/all')
         mod_versions = ffx.find_elements_by_class_name("listing")[0].find_elements_by_xpath("tbody/tr") # extract the table of files from the page
         row_info = collections.namedtuple("row_info", ["type", "filename", "cdn_id", "game_version"]) # create a custom tuple because data
@@ -177,31 +246,52 @@ def find_cdn(ffx, url, version):
             # parse out the four fields that we use
             entry_cells = version_entry.find_elements_by_tag_name("td")
             release_type = entry_cells[0].text
+            # Note that this is NOT the final filename - this is just the "release name".
             filename = urllib.parse.quote(entry_cells[1].find_elements_by_tag_name("a")[0].text)
             try:
                 game_version = tuple([int(x) for x in entry_cells[4].find_element_by_class_name("mr-2").text.split(".")]) # get game version and convert to tuple
             except:
                 game_version = (0, 0, 0)
             cdn_id = entry_cells[1].find_element_by_tag_name("a").get_property("href").split("/")[-1]
-            rows.append(row_info(release_type, filename, cdn_id, game_version))
+
+            #TODO make this configurable
+            if 'fabric' not in filename.lower() or 'forge' in filename.lower():
+                rows.append(row_info(release_type, filename, cdn_id, game_version))
         rows.sort(key=lambda x: x.game_version, reverse=True)
         best_row = next(x for x in rows if x.game_version <= version)
 
-        return f'https://media.forgecdn.net/files/{best_row.cdn_id[:4]}/{best_row.cdn_id[4:]}/{best_row.filename}'
+        # We need to find the real, ForgeCDN compatible filename now by going to the file page.
+        ffx.get(f'{url}/files/{best_row.cdn_id}')
+        # This will probably break in the future
+        filename = ffx.find_elements_by_xpath("html/body/div/main/div/div/section/div/div/div/section/section/article/div/div/span")[1].text
+        # URL escape the filename!
+        filename = urllib.parse.quote(filename)
+
+        # ForgeCDN requires that the leading zeroes are stripped from each portion of the CDN ID, hence the int() cast.
+        return f'https://media.forgecdn.net/files/{int(best_row.cdn_id[:4])}/{int(best_row.cdn_id[4:])}/{filename}'
 
     except:
+        print(url)
+        open('temp.txt', 'a').write(url)
         import traceback; traceback.print_exc()
         return None
 
 
 def firefox():
-    print("Starting Selenium...")
+    """
+    Start a headless Firefox instance and return the Selenium reference to it.
+    """
+    #print("Starting Selenium...")
     try:
         from selenium.webdriver import Firefox
+        from selenium.webdriver.firefox.options import Options
     except:
         print("Applying updates requires the `selenium` package")
-        os.exit(0)
-    return Firefox()
+        exit(0)
+    options = Options()
+    options.add_argument('-headless')
+    options.add_argument('--window-size 1920,1080')
+    return Firefox(executable_path='./geckodriver', options=options)
 
 COMMAND_MAP = {
     'install': install,
author	Dylan Jones <dylanjones2011@gmail.com>	2020-11-25 22:30:20 -0500
committer	Dylan Jones <dylanjones2011@gmail.com>	2020-11-25 22:30:20 -0500
commit	2c29fbad324c826e118be42e510a8388e4648ef5 (patch)
tree	2255f83ccb47235cc64cf0b319eb1a5439549e2b
parent	407d3016088d756020662cd9e2c44089ada6aefb (diff)
download	modpackman-2c29fbad324c826e118be42e510a8388e4648ef5.tar.gz modpackman-2c29fbad324c826e118be42e510a8388e4648ef5.zip