defi-rendu-legifrance/fetch.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov  5 20:55:36 2018

@author: suwako & thedevkiller
"""

import requests
import re


# Variables
# Base URL of the OpenStreetMap API. This is the development/testing server;
# it should be replaced by https://api.openstreetmap.org/ when the program is
# finished. The name is not referenced elsewhere in this file.
api = "https://master.apis.dev.openstreetmap.org/"


# Exact source lines that delimit the table of ANNEXE I in the Legifrance page.
_ANNEX_START_LINE = '<br/>ANNEXES<br/>ANNEXE I</p>'
_ANNEX_END_LINE = '<div style="margin-top: 30px; margin-bottom:20px;" id="JORFSCTA000037493059" class="titreSection">Annexe </div>'

# Keys of the dicts returned by runfetch(), in table-column order.
_COLUMN_KEYS = ("id", "commune", "site", "departement", "zone", "ministere", "aerozone")


def _parse_annex_rows(html):
    """Extract the rows of the ANNEXE I table from the page source.

    Args:
        html: Full HTML source of the page, as a single string.

    Returns:
        A list with one dict per table row, keyed by ``_COLUMN_KEYS``.

    Raises:
        ValueError: If a delimiter line is not found in the page, or a row
            chunk has no ``</tr>`` line.
        IndexError: If a row yields fewer than seven cell fragments.
    """
    lines = html.split('\n')
    annex = '\n'.join(lines[lines.index(_ANNEX_START_LINE):lines.index(_ANNEX_END_LINE)])

    rows = []
    # The first two '<tr>' chunks are skipped, exactly as the original code did
    # (presumably the text before the table and its header row -- TODO confirm).
    for chunk in annex.split('<tr>')[2:]:
        chunk_lines = chunk.split('\n')
        # Keep only what precedes the closing </tr> line of this row.
        row = '\n'.join(chunk_lines[:chunk_lines.index("</tr>")])

        cols = []
        for cell in row.split("</td"):
            cell_lines = cell.split('\n')
            # A cell is only used when its fragment spans exactly three lines;
            # its value is the third line minus its first five characters.
            # NOTE(review): those five characters are presumably an opening
            # tag -- confirm against the page markup.
            cols.append(cell_lines[2][5:] if len(cell_lines) == 3 else "")

        rows.append({key: cols[position] for position, key in enumerate(_COLUMN_KEYS)})
    return rows


def runfetch(url='https://www.legifrance.gouv.fr/eli/arrete/2018/10/12/PRMD1824595A/jo/texte', timeout=30):
    """Download the Legifrance page and return the raw rows of its ANNEXE I table.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts in the format
        {id, commune, site, departement, zone, ministere, aerozone}; every
        value is the raw HTML fragment of the corresponding cell.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the expected annex markers are missing from the page.
    """
    # The context manager closes the session (and its connections) on exit.
    with requests.Session() as session:
        response = session.get(url, timeout=timeout)  # Get the source code
        # Fail with a clear HTTP error instead of a confusing parse error later.
        response.raise_for_status()
        return _parse_annex_rows(response.text)


# Degrees / minutes / seconds / hemisphere letter, e.g. 48° 51' 24" n.
# Raw string so that \d is a regex escape and not an invalid string escape.
# NOTE(review): "(?:,|.)" lets any single character act as the decimal
# separator of the seconds; kept as-is to accept the same inputs as before.
_DMS_PATTERN = re.compile(r"(\d{1,3})° (\d{2})(?:'|′) (\d{2}(?:,|.)?\d{0,3})(?:\"|”|\'\') ?(.)")


def dms2dd(dms):
    """Convert a DMS (degrees, minutes, seconds) coordinate to decimal degrees.

    Args:
        dms: Text containing a coordinate such as ``48° 51' 24" N``. The
            seconds may use a comma as decimal separator. The hemisphere
            letter is case-insensitive: n/e are positive, s/o/w are negative
            ("o" being the French "ouest").

    Returns:
        The coordinate as a float, negative for the southern and western
        hemispheres, or None when the hemisphere letter is not recognised.

    Raises:
        ValueError: If no DMS coordinate can be found in ``dms``.
    """
    match = _DMS_PATTERN.search(dms)
    if match is None:
        raise ValueError(f"not a recognisable DMS coordinate: {dms!r}")

    degrees, minutes, seconds, hemisphere = match.groups()
    dd = float(degrees) + float(minutes) / 60 + float(seconds.replace(",", ".")) / 3600

    hemisphere = hemisphere.lower()
    if hemisphere in ("n", "e"):
        return dd
    if hemisphere in ("s", "o", "w"):
        return -dd
    # Unknown hemisphere letter: same result as before, now spelled out.
    return None


# Patterns used by fetch(). They are raw strings so that regex escapes such
# as \d and \/ are not read as (invalid) Python string escapes, and they are
# compiled once instead of on every loop iteration.
_BR_TAG = re.compile(r"<br.{0,2}>")  # <br>, <br /> and <br/>
_POLYGON_SECTION_HEADER = re.compile(r".*(?:zone|csg).*")  # whole line naming a polygon
_MULTI_POLYGON_CORNER = re.compile(r".{0,35} ?:? ?.? ?: ?(\d{1,3}° \d{2}(?:'|′) \d{2},?\d{0,3}(?:\"|”|\'\') .) ?\/ ?(\d{1,3}° \d{2}(?:'|′|\'\') \d{2},?\d{0,3}(?:\"|”) .)")
_SINGLE_POLYGON_CORNER = re.compile(r".{0,35} ?:? ?.? ?: ?(\d{1,3}° \d{2}(?:'|′) \d{2}(?:,|.)?\d{0,3}(?:\"|”|\'\') ?.) ?\/ ?(\d{1,3}° \d{2}(?:'|′|\'\') \d{2}(?:,|.)?\d{0,3}(?:\"|”|\'\') ?.)")
_CIRCLE = re.compile(r"(.{6}) .{0,5}?(\d{1,3},?\d{0,2} ?.{1,2}) .{1,35} ?(\d{3}° \d{1,2}' \d{1,2}” .) ?\/ ?(\d{1,3}° \d{1,2}' \d{1,2}” .)", re.S)
_LEADING_NUMBER = re.compile(r"(\d*,?\d*).*")


def fetch(url='https://www.legifrance.gouv.fr/eli/arrete/2018/10/12/PRMD1824595A/jo/texte'):
    """Scrape the page at ``url`` and turn each "zone" description into numbers.

    The rows come from ``runfetch``. In every field, ``<br>`` tags are
    replaced by newlines. The "zone" field of each row is then lower-cased and
    replaced according to what it describes:

    * several polygons ("polygones"): a list of polygons, each a list of
      ``[a, b]`` pairs of decimal degrees, one pair per ``a / b`` corner
      found in the text;
    * one polygon ("polygone"): the same structure with a single polygon;
    * a circle ("cercle"): ``[[label, radius, a, b]]`` where ``label`` is the
      first six characters matched, ``radius`` is a float (multiplied by 1000
      when expressed in km) and ``a``/``b`` are decimal degrees;
    * the atolls entry: two hard-coded circle tuples;
    * anything else: the lower-cased text, unchanged.

    Args:
        url: Address of the page to scrape.

    Returns:
        The list of row dicts, with the "zone" values converted as above.

    Raises:
        ValueError: If a "cercle" description does not match the expected
            layout.
    """
    diclist = runfetch(url=url)

    # Replace <br>, <br /> and <br/> by \n in every field
    for entry in diclist:
        for key in entry:
            entry[key] = _BR_TAG.sub("\n", entry[key])

    # Build the area of each entry
    for area in diclist:
        # Case insensitive
        zone = area["zone"].lower()
        area["zone"] = zone

        # Multiple polygons -- tested first because "polygone" is a substring
        # of "polygones".
        if "polygones" in zone:
            # Split on the lines naming each polygon; the text before the
            # first such line is dropped.
            sections = _POLYGON_SECTION_HEADER.split(zone)[1:]
            area["zone"] = [
                [[dms2dd(point) for point in corner] for corner in _MULTI_POLYGON_CORNER.findall(section)]
                for section in sections
            ]
        # Single polygon
        elif "polygone" in zone:
            area["zone"] = [
                [[dms2dd(point) for point in corner] for corner in _SINGLE_POLYGON_CORNER.findall(zone)]
            ]
        # Circles
        elif "cercle" in zone:
            match = _CIRCLE.search(zone)
            if match is None:
                raise ValueError(f"unrecognised circle description: {zone!r}")
            label, radius_text, first, second = match.groups()
            radius = float(_LEADING_NUMBER.search(radius_text).group(1).replace(",", "."))
            if "km" in radius_text:
                radius *= 1000  # km -> m
            area["zone"] = [[label, radius, dms2dd(first), dms2dd(second)]]
        # Atolls
        elif zone.strip() == "atolls et eaux territoriales incluses":
            area["zone"] = [("cercle", 35000.0, -138.9022, -21.82917), ("cercle", 27000, -138.7425, -22.23528)]
    return diclist


if __name__ == '__main__':
    # Scrape the default French article and dump every parsed entry.
    diclist = fetch()

    # Print the dict (keep this code in the end of the file)
    print("===== Dict =====")
    for position, entry in enumerate(diclist):
        print(f"\n-------------{position}----------------\n")
        for key, value in entry.items():
            print(f"    {key}: {value}")
    print("\n")
-												Checkpoint - implementing the scrapper (WIP)

											
										
										
											2018-11-05 22:50:53 +01:00
+								#!/usr/bin/env python3
 								# -*- coding: utf-8 -*-
 								"""
 								Created on Mon Nov  5 20:55:36 2018
-												[Plotgen] added support for polygons

											
										
										
											2018-11-11 14:22:57 +01:00
+								@author: suwako & thedevkiller
-												Checkpoint - implementing the scrapper (WIP)

											
										
										
											2018-11-05 22:50:53 +01:00
+								"""
-												Improved readabilty (i guess)

											
										
										
											2018-11-06 22:13:18 +01:00
-												Checkpoint - implementing the scrapper (WIP)

											
										
										
											2018-11-05 22:50:53 +01:00
+								import requests
-												Added code to replace <br/> by \n in positions

											
										
										
											2018-11-07 23:32:46 +01:00
+								import re
-												Improved readabilty (i guess)

											
										
										
											2018-11-06 22:13:18 +01:00
-												Converted DMS (degrees, minutes, seconds) to DD (decimal degrees)

											
										
										
											2018-11-11 00:32:33 +01:00
+								# Variables
-												Moved interpreter code into a function and ran flake8 (except for E501)

											
										
										
											2018-11-11 09:39:34 +01:00
+								api = "https://master.apis.dev.openstreetmap.org/"  # Testing, it should be replaced by https://api.openstreetmap.org/ when the program is finished
-												Converted DMS (degrees, minutes, seconds) to DD (decimal degrees)

											
										
										
											2018-11-11 00:32:33 +01:00
-												Moved interpreter code into a function and ran flake8 (except for E501)

											
										
										
											2018-11-11 09:39:34 +01:00
+								def runfetch(url='https://www.legifrance.gouv.fr/eli/arrete/2018/10/12/PRMD1824595A/jo/texte'):
 								    session = requests.Session()
 								    req1 = session.get(url)  # Get the source code
 								    text = req1.text.split('\n')
-												Added code to replace <br/> by \n in positions

											
										
										
											2018-11-07 23:32:46 +01:00
+								    # Parse the HTML source code
-												Moved interpreter code into a function and ran flake8 (except for E501)

											
										
										
											2018-11-11 09:39:34 +01:00
+								    text = ['\n'.join(row.split('\n')[:row.split('\n').index("</tr>"):]) for row in '\n'.join(text[text.index('<br/>ANNEXES<br/>ANNEXE I</p>'):text.index('<div style="margin-top: 30px; margin-bottom:20px;" id="JORFSCTA000037493059" class="titreSection">Annexe </div>'):]).split('<tr>')[2::]]
 								    diclist = []
-												Checkpoint - implementing the scrapper (WIP)

											
										
										
											2018-11-05 22:50:53 +01:00
+								    for row in text:
-												Moved interpreter code into a function and ran flake8 (except for E501)

											
										
										
											2018-11-11 09:39:34 +01:00
+								        cols = [text.split('\n')[2][5::] if len(text.split('\n')) == 3 else "" for text in row.split("</td")[::]]
 								        diclist.append({"id": cols[0], "commune": cols[1], "site": cols[2], "departement": cols[3], "zone": cols[4], "ministere": cols[5], "aerozone": cols[6]})
 								#       dicformat:{id,commune,site,departement,zone,ministere,aerozone}
-												Checkpoint - Function now returns a dic - WIP

											
										
										
											2018-11-06 20:55:47 +01:00
+								    return diclist
-												Improved readabilty (i guess)

											
										
										
											2018-11-06 22:13:18 +01:00
-												Moved interpreter code into a function and ran flake8 (except for E501)

											
										
										
											2018-11-11 09:39:34 +01:00
-												Converted DMS (degrees, minutes, seconds) to DD (decimal degrees)

											
										
										
											2018-11-11 00:32:33 +01:00
+								def dms2dd(dms):
-												Fixed negative/positives coords

											
										
										
											2018-11-11 16:39:26 +01:00
+								    coordslst = re.search("(\d{1,3})° (\d{2})(?:'|′) (\d{2}(?:,|.)?\d{0,3})(?:\"|”|\'\') ?(.)", dms).groups()
-												Converted DMS (degrees, minutes, seconds) to DD (decimal degrees)

											
										
										
											2018-11-11 00:32:33 +01:00
+								    dd = float(coordslst[0].replace(",", ".")) + float(coordslst[1].replace(",", "."))/60 + float(coordslst[2].replace(",", "."))/3600
-												Fixed bug

											
										
										
											2018-11-11 17:36:24 +01:00
+								    if coordslst[3] in ["n", "e"]:
-												Fixed negative/positives coords

											
										
										
											2018-11-11 16:39:26 +01:00
+								        return dd
-												Fixed bug

											
										
										
											2018-11-11 17:36:24 +01:00
+								    if coordslst[3] in ["s", "o"]:
-												Fixed negative/positives coords

											
										
										
											2018-11-11 16:39:26 +01:00
+								        return -dd
-												Improved readabilty (i guess)

											
										
										
											2018-11-06 22:13:18 +01:00
-												Converted DMS (degrees, minutes, seconds) to DD (decimal degrees)

											
										
										
											2018-11-11 00:32:33 +01:00
-												Moved interpreter code into a function and ran flake8 (except for E501)

											
										
										
											2018-11-11 09:39:34 +01:00
+								def fetch(url='https://www.legifrance.gouv.fr/eli/arrete/2018/10/12/PRMD1824595A/jo/texte'):
 								    diclist = runfetch(url=url)
-												Added code to replace <br/> by \n in positions

											
										
										
											2018-11-07 23:32:46 +01:00
+								    for index, zone in enumerate(diclist):
 								        for element in zone:
-												Moved interpreter code into a function and ran flake8 (except for E501)

											
										
										
											2018-11-11 09:39:34 +01:00
+								            diclist[index][element] = re.sub("<br.{0,2}>", "\n", diclist[index][element])  # Replace <br>, <br /> and <br/> by \n
-												Converted DMS (degrees, minutes, seconds) to DD (decimal degrees)

											
										
										
											2018-11-11 00:32:33 +01:00
-												Made lists of tuples with the corners of the polygon

											
										
										
											2018-11-08 00:06:37 +01:00
+								    # Make a the area
 								    for area in diclist:
-												Fixed some regex

											
										
										
											2018-11-09 23:04:19 +01:00
+								        # Case insensitive
 								        area["zone"] = area["zone"].lower()
-												Converted DMS (degrees, minutes, seconds) to DD (decimal degrees)

											
										
										
											2018-11-11 00:32:33 +01:00
+								        # Multiple polygons
-												Fixed some regex

											
										
										
											2018-11-09 23:04:19 +01:00
+								        if "polygones" in area["zone"]:
 								            lst = []
-												Fixed polygons split regex

											
										
										
											2018-11-12 00:29:17 +01:00
+								            tmp = re.compile(".*(?:zone|csg).*").split(area["zone"])  # List of polygons
-												Converted DMS (degrees, minutes, seconds) to DD (decimal degrees)

											
										
										
											2018-11-11 00:32:33 +01:00
+								            del tmp[0]
-												Fixed some regex

											
										
										
											2018-11-09 23:04:19 +01:00
+								            for polygon in tmp:
-												Fixed some bugs with regex

											
										
										
											2018-11-11 14:40:17 +01:00
+								                lst.append(re.findall(".{0,35} ?:? ?.? ?: ?(\d{1,3}° \d{2}(?:'|′) \d{2},?\d{0,3}(?:\"|”|\'\') .) ?\/ ?(\d{1,3}° \d{2}(?:'|′|\'\') \d{2},?\d{0,3}(?:\"|”) .)", polygon))
-												Converted DMS (degrees, minutes, seconds) to DD (decimal degrees)

											
										
										
											2018-11-11 00:32:33 +01:00
+								            area["zone"] = []
 								            for index, polygon in enumerate(lst):
 								                area["zone"].append([])
 								                for index2, coords in enumerate(polygon):
 								                    area["zone"][index].append([])
 								                    for index3, point in enumerate(coords):
 								                        area["zone"][index][index2].append(dms2dd(point))
 								                        # area["zone"][index][index2][index3] = dms2dd(point)
-												Fixed some regex

											
										
										
											2018-11-09 23:04:19 +01:00
+								            del lst
 								            del tmp
-												Made lists of tuples with the corners of the polygon

											
										
										
											2018-11-08 00:06:37 +01:00
+								        # Polygons
-												Fixed some regex

											
										
										
											2018-11-09 23:04:19 +01:00
+								        elif "polygone" in area["zone"]:
 								            lst = []
-												Fixed some bugs with regex

											
										
										
											2018-11-11 14:58:43 +01:00
+								            lst.append(re.findall(".{0,35} ?:? ?.? ?: ?(\d{1,3}° \d{2}(?:'|′) \d{2}(?:,|.)?\d{0,3}(?:\"|”|\'\') ?.) ?\/ ?(\d{1,3}° \d{2}(?:'|′|\'\') \d{2}(?:,|.)?\d{0,3}(?:\"|”|\'\') ?.)", area["zone"]))
-												Converted DMS (degrees, minutes, seconds) to DD (decimal degrees)

											
										
										
											2018-11-11 00:32:33 +01:00
+								            area["zone"] = []
 								            for index, polygon in enumerate(lst):
 								                area["zone"].append([])
 								                for index2, coords in enumerate(polygon):
 								                    area["zone"][index].append([])
 								                    for index3, point in enumerate(coords):
 								                        area["zone"][index][index2].append([])
 								                        area["zone"][index][index2][index3] = dms2dd(point)
-												Added regex to match with circles and corrected polygons regex

											
										
										
											2018-11-08 21:22:44 +01:00
+								            del lst
-												Fixed some regex

											
										
										
											2018-11-09 23:04:19 +01:00
+								        # Circles
 								        elif "cercle" in area["zone"]:
-												Converted km to m

											
										
										
											2018-11-11 15:18:18 +01:00
+								            lst = [[]]
 								            lst[0] = list(re.search(r"(.{6}) .{0,5}?(\d{1,3},?\d{0,2} ?.{1,2}) .{1,35} ?(\d{3}° \d{1,2}' \d{1,2}” .) ?\/ ?(\d{1,3}° \d{1,2}' \d{1,2}” .)", area["zone"], re.S).groups())
 								            if "km" in lst[0][1]:
-												Fixed problem in km to m conversion (i'm very dumb)

											
										
										
											2018-11-11 16:11:20 +01:00
+								                lst[0][1] = float(re.search("(\d*,?\d*).*", lst[0][1]).groups()[0].replace(",", "."))*1000
-												Transformed m to float

											
										
										
											2018-11-11 16:29:00 +01:00
+								            else:
 								                lst[0][1] = float(re.search("(\d*,?\d*).*", lst[0][1]).groups()[0].replace(",", "."))
-												Converted dms to dd for circles

											
										
										
											2018-11-11 16:07:07 +01:00
+								            lst[0][2] = dms2dd(lst[0][2])
 								            lst[0][3] = dms2dd(lst[0][3])
-												Made lists of tuples with the corners of the polygon

											
										
										
											2018-11-08 00:06:37 +01:00
+								            area["zone"] = lst
 								            del lst
-												Added Mururoa and Fangataufa atolls

											
										
										
											2018-11-10 15:03:45 +01:00
+								        # Atolls
 								        elif area["zone"].strip() == "atolls et eaux territoriales incluses":
-												Fixed atolls coordinates

											
										
										
											2018-11-11 17:44:28 +01:00
+								            area["zone"] = [("cercle", 35000.0, -138.9022, -21.82917), ("cercle", 27000, -138.7425, -22.23528)]
-												Moved interpreter code into a function and ran flake8 (except for E501)

											
										
										
											2018-11-11 09:39:34 +01:00
+								    return diclist
-												Converted DMS (degrees, minutes, seconds) to DD (decimal degrees)

											
										
										
											2018-11-11 00:32:33 +01:00
-												Moved interpreter code into a function and ran flake8 (except for E501)

											
										
										
											2018-11-11 09:39:34 +01:00
+								if __name__ == '__main__':
 								    diclist = fetch()  # Fetch all the data from a french article
-												Made lists of tuples with the corners of the polygon

											
										
										
											2018-11-08 00:06:37 +01:00
 								    # Print the dict (keep this code in the end of the file)
-												Improved the dict print

											
										
										
											2018-11-07 23:10:52 +01:00
+								    print("===== Dict =====")
-												Made lists of tuples with the corners of the polygon

											
										
										
											2018-11-08 00:06:37 +01:00
+								    for index, area in enumerate(diclist):
-												Transformed m to float

											
										
										
											2018-11-11 16:29:00 +01:00
+								        print(f"\n-------------{index}----------------\n")
 								        for element in area:
 								            print(f"    {element}: {diclist[index][element]}")
-												Moved interpreter code into a function and ran flake8 (except for E501)

											
										
										
											2018-11-11 09:39:34 +01:00
+								    print("\n")