"""Scrape item and mine data from deeptownguide.com into JSON files.

Running the module prints the parsed data of one sample item and then
writes ``items.json`` and ``mines.json`` in the current directory.
"""
import json
import os
import re
from html.parser import HTMLParser

import requests
from fs.osfs import OSFS

# Not referenced by the functions in this file; kept in case other modules
# import it from here.
fileSystem = None

BASE_URL = "https://deeptownguide.com"
# requests waits forever by default; bound every call instead.
REQUEST_TIMEOUT = 30  # seconds

_ITEM_PATH_RE = re.compile(r"/Items/Details/[0-9]+/[a-zA-Z0-9-]*")
_DURATION_RE = re.compile(r"([0-9]+)(Second|Minute|Hour)s?")
_NEEDED_RE = re.compile(r"([A-Za-z]+)\s*([0-9][0-9,.]*)")
_PERCENT_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)%?")
_SECONDS_PER_UNIT = {"Second": 1, "Minute": 60, "Hour": 60 * 60}

# Text that introduces the recipe table of an item page, with whitespace
# removed (all page text is compared without whitespace).
_RECIPE_HEADING_SUFFIX = "iscreatedfromthisrecipe"
_RECIPE_HEADER_TEXT = (
    "BuildingNameUnlockedatDepthCostToUnlock"
    "TimeRequiredAmountCreatedItemsRequired"
)
# Column positions in a recipe row, following the header order above.
_COL_BUILDING = 0
_COL_TIME = 3
_COL_AMOUNT = 4
_COL_NEEDED = 5


class _PageTextParser(HTMLParser):
    """Collect the text of an HTML page, flat and grouped by table row/cell.

    Whitespace is removed from every text fragment. ``fragments`` holds all
    non-empty fragments in document order; ``rows`` holds one
    ``(start, cells)`` pair per ``<tr>``, where ``start`` is the number of
    fragments seen before the row and each cell is the list of fragments
    found inside one ``<td>``/``<th>``.
    """

    _IGNORED_TAGS = ("script", "style")
    _CELL_TAGS = ("td", "th")

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.fragments = []
        self.rows = []
        self._row = None
        self._cell = None
        self._ignored_depth = 0

    def handle_starttag(self, tag, attrs):
        if tag in self._IGNORED_TAGS:
            self._ignored_depth += 1
        elif tag == "tr":
            self._row = []
            self._cell = None
            self.rows.append((len(self.fragments), self._row))
        elif tag in self._CELL_TAGS and self._row is not None:
            self._cell = []
            self._row.append(self._cell)

    def handle_endtag(self, tag):
        if tag in self._IGNORED_TAGS:
            self._ignored_depth = max(0, self._ignored_depth - 1)
        elif tag in self._CELL_TAGS:
            self._cell = None
        elif tag in ("tr", "table"):
            self._row = None
            self._cell = None

    def handle_data(self, data):
        if self._ignored_depth:
            return
        text = "".join(data.split())
        if not text:
            return
        self.fragments.append(text)
        if self._cell is not None:
            self._cell.append(text)


def _parse_page(html):
    """Return a fed ``_PageTextParser`` for the given HTML text."""
    parser = _PageTextParser()
    parser.feed(html)
    parser.close()
    return parser


def _fragment_after(fragments, label):
    """Return the fragment that directly follows ``label``, or None."""
    for current, following in zip(fragments, fragments[1:]):
        if current == label:
            return following
    return None


def _parse_duration(text):
    """Convert text such as ``"1Minute30Seconds"`` to a number of seconds."""
    return sum(
        int(number) * _SECONDS_PER_UNIT[unit]
        for number, unit in _DURATION_RE.findall(text)
    )


def _find_recipe_cells(parsed):
    """Return the cells of the first recipe row of an item page, or None.

    The recipe table is the one whose header row spells
    ``_RECIPE_HEADER_TEXT`` and which is directly preceded by the
    "... is created from this recipe" heading.
    """
    for position, (start, cells) in enumerate(parsed.rows):
        header = "".join("".join(cell) for cell in cells)
        if header != _RECIPE_HEADER_TEXT:
            continue
        if start == 0:
            continue
        if not parsed.fragments[start - 1].endswith(_RECIPE_HEADING_SUFFIX):
            continue
        if position + 1 < len(parsed.rows):
            return parsed.rows[position + 1][1]
        return None
    return None


def _parse_sector_cell(fragments):
    """Return ``(name, share)`` for one resource cell, or None.

    ``share`` is the percentage found after the name, divided by 100.
    """
    for name, share in zip(fragments, fragments[1:]):
        match = _PERCENT_RE.fullmatch(share)
        if match and re.fullmatch(r"\w+", name):
            return name, float(match.group(1)) / 100
    return None


def format_string(text):
    """Return ``text`` lower-cased with spaces, hyphens and underscores removed."""
    for char in (" ", "-", "_"):
        text = text.replace(char, "")
    return text.lower()


def get_all_item_urls():
    """Return the absolute URL of every item detail page linked from /Items.

    URLs are returned without duplicates, in order of first appearance.
    An empty list is returned when the page does not answer with HTTP 200.
    """
    page = requests.get(BASE_URL + "/Items", timeout=REQUEST_TIMEOUT)
    if page.status_code != 200:
        return []
    paths = _ITEM_PATH_RE.findall(page.text)
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(BASE_URL + path for path in paths))


def get_item_info(url):
    """Fetch one item page and return its parsed data.

    The result is a dict with the keys ``type`` and ``building`` (normalised
    with ``format_string``), ``value`` (sell price as int), ``quantity``
    (amount created by the recipe), ``time`` (recipe duration in seconds) and
    ``needed`` (ingredient name -> quantity). Fields that cannot be found
    keep their defaults: None for type/building/value, 0 for quantity/time
    and an empty dict for needed.

    NOTE(review): the tag-level regular expressions this function used were
    not recoverable, so the page is parsed through its text fragments and
    table cells instead, using the same labels, column order and value
    patterns. Confirm the output against the live page before relying on it.
    """
    result = {
        "type": None,
        "building": None,
        "value": None,
        "quantity": 0,
        "time": 0,
        "needed": {},
    }
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    # page.text is the decoded body; str(page.content) would be the repr of
    # a bytes object, including the b'' wrapper and escaped newlines.
    parsed = _parse_page(page.text)

    type_text = _fragment_after(parsed.fragments, "Type")
    if type_text is not None:
        result["type"] = format_string(type_text)

    value_text = _fragment_after(parsed.fragments, "SellPrice")
    if value_text is not None:
        value_match = re.match(r"[0-9,]+", value_text)
        digits = value_match.group(0).replace(",", "") if value_match else ""
        if digits:
            result["value"] = int(digits)

    # Recipe data is optional: items without a recipe keep the defaults.
    cells = _find_recipe_cells(parsed)
    if cells is None or len(cells) <= _COL_NEEDED:
        return result

    building = "".join(cells[_COL_BUILDING])
    if building:
        result["building"] = format_string(building)

    result["time"] = _parse_duration("".join(cells[_COL_TIME]))

    amount_match = re.match(r"[0-9]+", "".join(cells[_COL_AMOUNT]))
    if amount_match:
        result["quantity"] = int(amount_match.group(0))

    needed_text = " ".join(cells[_COL_NEEDED])
    for item_name, item_quantity in _NEEDED_RE.findall(needed_text):
        digits = item_quantity.replace(",", "").replace(".", "")
        result["needed"][format_string(item_name)] = int(digits)

    return result


def get_sector_info():
    """Fetch /Areas/Resources and return the resources found at each depth.

    The result maps the depth (as a string) to a dict of resource name ->
    share, where share is the listed percentage divided by 100. An extra
    ``"0"`` entry maps every resource name seen to 0.

    NOTE(review): the original tag-level regular expressions were not
    recoverable. This version assumes each depth is one table row whose
    first cell is the depth number and whose other cells hold a resource
    name followed by a percentage - confirm against the live page.
    """
    page = requests.get(BASE_URL + "/Areas/Resources", timeout=REQUEST_TIMEOUT)
    parsed = _parse_page(page.text)

    floors = {}
    known_items = []
    for _, cells in parsed.rows:
        if len(cells) < 2:
            continue
        depth_text = "".join(cells[0])
        if not re.fullmatch(r"[0-9]+", depth_text):
            continue

        items = {}
        row_is_valid = True
        for cell in cells[1:]:
            if not cell:
                continue  # empty cell: no resource in this column
            entry = _parse_sector_cell(cell)
            if entry is None:
                row_is_valid = False  # not a resource row after all
                break
            name, share = entry
            items[name] = share
        if not row_is_valid:
            continue

        for name in items:
            if name not in known_items:
                known_items.append(name)
        floors[str(int(depth_text))] = items

    floors["0"] = {name: 0 for name in known_items}
    return floors


def update_data():
    """Scrape every item and the mine resources, then write the JSON files.

    Prints the number of items found and a running percentage, and writes
    ``items.json`` and ``mines.json`` to the current directory.
    """
    items = {}
    urls_item = get_all_item_urls()
    total = len(urls_item)
    print(total)
    for done, item_url in enumerate(urls_item, start=1):
        # The last path segment is the item's name slug.
        slug = item_url.rsplit("/", 1)[-1]
        items[format_string(slug)] = get_item_info(item_url)
        print(done * 100 / total, "%")

    with open("items.json", "w") as dest_file:
        json.dump(items, dest_file)

    # Fetch before opening so a failed request does not truncate the file.
    sectors = get_sector_info()
    with open("mines.json", "w") as dest_file:
        json.dump(sectors, dest_file)
    return None


if __name__ == "__main__":
    print(get_item_info('https://deeptownguide.com/Items/Details/702/stage-ii'))
    update_data()