From 57cdc465b83f1cf9a2730aa15bd4b66e9c26729a Mon Sep 17 00:00:00 2001 From: scossa Date: Mon, 27 Nov 2023 03:45:23 +0100 Subject: [PATCH] update --- scrapeje.py | 445 +++++++++++++++++++++++++++++----------------------- 1 file changed, 247 insertions(+), 198 deletions(-) diff --git a/scrapeje.py b/scrapeje.py index 5c1a926..39e2427 100644 --- a/scrapeje.py +++ b/scrapeje.py @@ -9,13 +9,6 @@ import re import os import shutil -#Inizializzo liste -nome=[] -desc=[] -npezzi=[] -prezzo=[] -prezzoN=[] - restaurant_url='' def inputurl(): @@ -25,154 +18,274 @@ def inputurl(): print ("\nesempio: https://www.justeat.it/restaurants-pizzeria-la-garganica-bologna/menu") restaurant_url = input('INSERISCI IL LINK DELLA PAGINA DEL RISTORANTE: ') #PER DEBUG - #restaurant_url = 'https://www.justeat.it/restaurants-dolceirnerio/menu' + #restaurant_url = 'https://www.justeat.it/restaurants-pizzeriadelrondone-bologna/menu' -# if os.path.exists('rubrica.txt'): -# if os.path.exists('./DATI_RUBRICA'): -inputurl() -# wr = input('\nVuoi vedere la rubrica? [Y|N] ') -# if wr.upper() in ['YES', 'Y', 'SI', 'S']: -# print( '\n', os.listdir('./DATI_RUBRICA'), '\n') -# scelta = input("Scegli un numero esistente o premi Enter per metterre un link: ") -# files = os.listdir('./DATI_RUBRICA/') -# for file in files: -# if scelta in file: -# restaurant_url = (file) -# else: -# print ("numero inesistente") -# inputurl() -# else: -# inputurl() -# else: -# inputurl() -# else: -# inputurl() +nome=[] +desc=[] +npezzi=[] +prezzo=[] +prezzoN=[] +nrist='' +restaurant_address='' +tel='' +result='' +def stora_tutto(): + global nome + global desc + global npezzi + global prezzo + global prezzoN + global soup + global nrist + global restaurant_address + global tel + global result + #Stora nome ristorante + nrist = soup.title.text[8:-32] -#SCRAPE -driver = webdriver.Chrome() -driver.get(restaurant_url) + #Stora telefono del ristorante + pattern = re.compile(r'allergenPhoneNumber') + script_tags = soup.find_all('script', string=pattern) + pattern = re.compile(r'"allergenPhoneNumber":"(\d+)"') + tel = re.search(pattern, script_tags[0].next) + if tel: + tel = tel.group(1) -wait = WebDriverWait(driver, 16) -wait.until(EC.presence_of_element_located((By.CLASS_NAME, "c-menuItems-price"))) + #Stato ristorante + restaurant_is_open = menu = soup.find(attrs={"data-js-test":"order-status-wrapper"}).text + restaurant_is_open = restaurant_is_open.replace('\n', ' ') + regex = r" {4,}" + result = re.split(regex, restaurant_is_open) -page = driver.page_source + #indirizzo ristorante + restaurant_address = soup.find(attrs={"data-js-test":"header-restaurantAddress"}).text -with open('JEmenu.html', 'w') as f: - f.write(page) + #cicla le schede prodotto + menu = soup.find(attrs={"data-test-id": "menu-item"}) + for menu in soup.find_all(attrs={"data-test-id": "menu-item"}): + att=menu + #riempie la lista "nome" + for att in menu.find(attrs={"data-test-id": "menu-item-name"}): + if att != type(None): + if att != " ": + nome.append(att.lstrip().splitlines()[0]) + break + else: + continue - -#PARSER -with open('JEmenu.html', 'r') as f: - page = f.read() - -soup = BeautifulSoup(page, "html.parser") - - -#Stora nome ristorante -nrist = soup.title.text[8:-32] - -#Stora telefono del ristorante -pattern = re.compile(r'allergenPhoneNumber') -script_tags = soup.find_all('script', string=pattern) -pattern = re.compile(r'"allergenPhoneNumber":"(\d+)"') -tel = re.search(pattern, script_tags[0].next) -if tel: - tel = tel.group(1) - -#Stato ristorante -restaurant_is_open = menu = soup.find(attrs={"data-js-test":"order-status-wrapper"}).text -restaurant_is_open = restaurant_is_open.replace('\n', ' ') -regex = r" {4,}" -result = re.split(regex, restaurant_is_open) - -#indirizzo ristorante -restaurant_address = soup.find(attrs={"data-js-test":"header-restaurantAddress"}).text - -#cicla le schede prodotto -menu = soup.find(attrs={"data-test-id": "menu-item"}) -for menu in soup.find_all(attrs={"data-test-id": "menu-item"}): - att=menu - #riempie la lista "nome" - for att in menu.find(attrs={"data-test-id": "menu-item-name"}): - if att != type(None): - if att != " ": - nome.append(att.lstrip().splitlines()[0]) - break + #riempie la lista "desc" + att=menu.find("p", class_="c-menuItems-description") + if att != None: + for att in menu.find("p", class_="c-menuItems-description"): + desc.append(att.lstrip().splitlines()[0]) else: - continue + desc.append(None) - #riempie la lista "desc" - att=menu.find("p", class_="c-menuItems-description") - if att != None: - for att in menu.find("p", class_="c-menuItems-description"): - desc.append(att.lstrip().splitlines()[0]) - else: - desc.append(None) - - #riempie la lista "prezzo" - for att in menu.find(attrs={"data-js-test": "menu-item-price"}): - #prezzo.append(att.lstrip().splitlines()[0]) - counter=0 - if att != " " and counter % 2 == 0: - prezzo.append(att.lstrip()) - counter+=1 - else: - counter+=1 + #riempie la lista "prezzo" + for att in menu.find(attrs={"data-js-test": "menu-item-price"}): + #prezzo.append(att.lstrip().splitlines()[0]) + counter=0 + if att != " " and counter % 2 == 0: + prezzo.append(att.lstrip()) + counter+=1 + else: + counter+=1 + continue continue + + #riempie la lista "npezzi" + att=menu.find_all(attrs={"data-test-id": "menu-item-description"}) + if att != None: + if menu.text.find("pezzo") > 0 or menu.text.find("pezzi") > 0: + npezzi.append(menu.text.splitlines()[7].lstrip()) + else: + npezzi.append(None) continue + + +driver='' +page = '' +def scraper(): + global driver + global page + global restaurant_url + driver = webdriver.Chrome() + driver.get(restaurant_url) + + wait = WebDriverWait(driver, 16) + wait.until(EC.presence_of_element_located((By.CLASS_NAME, "c-menuItems-price"))) + + page = driver.page_source + + with open('JEmenu.html', 'w') as f: + f.write(page) + +soup = '' +def parser(): + global soup + with open('JEmenu.html', 'r') as f: + page = f.read() + + soup = BeautifulSoup(page, "html.parser") - #riempie la lista "npezzi" - att=menu.find_all(attrs={"data-test-id": "menu-item-description"}) - if att != None: - if menu.text.find("pezzo") > 0 or menu.text.find("pezzi") > 0: - npezzi.append(menu.text.splitlines()[7].lstrip()) +def parserdarubrica(): + global parser + global stora_tutto + folder_path = ('./DATI_RUBRICA/') + for filename in os.listdir(folder_path): + if filename.startswith(scelta): + print(filename) + filename = filename.replace(' ', ' ') + print(filename) + shutil.copy (folder_path + filename, f'./JEmenu.html') + parser() + stora_tutto() else: - npezzi.append(None) - continue + print ("numero inesistente") + continue -#Chiude chromium -driver.quit() +def stampa_liste(): + print("\n") + for x in range(len(nome)): + # print("\n") + print(nome[x]) + print(desc[x]) + print(npezzi[x]) + print(prezzo[x]) -# #stampa liste -print("\n") -for x in range(len(nome)): - # print("\n") - print(nome[x]) - print(desc[x]) - print(npezzi[x]) - print(prezzo[x]) +def stampa_info(): + print("-" * (len(desc)) + "\n") + print(nrist) + print(restaurant_address.strip()) + print("Telefono:",tel,"\n") -#stampa info ristorante -print("-" * (len(desc)) + "\n") -print(nrist) -print(restaurant_address.strip()) -print("Telefono:",tel,"\n") + doppione = "" + for i in range(len(result)): + if re.search(r"[a-zA-Z]", result[i]): + if (result[i]) == doppione: + continue + else: + print(result[i]) + elif re.search(r"\d", result[i]): + print(result[i], result[i+1]) + doppione = (result[i+1]) + print("\n" + "-" * (len(desc))+ "\n") -doppione = "" -for i in range(len(result)): - if re.search(r"[a-zA-Z]", result[i]): - if (result[i]) == doppione: - continue +def genera_prezzoN(): + #Genera la lista prezzoN[] che è un clone di "prezzo[] ma con i valori float anzichè string" + prezzoN = prezzo.copy() + for i in range(len(prezzo)): + if "€" in prezzo[i]: + prezzoN[i] = float(prezzo[i].replace("€", "").replace(",", ".").replace("da ", "")) + elif "Non" in prezzo[i]: + prezzoN[i] = 0 + prezzoN[i] = float(prezzoN[i]) else: - print(result[i]) - elif re.search(r"\d", result[i]): - print(result[i], result[i+1]) - doppione = (result[i+1]) -print("\n" + "-" * (len(desc))+ "\n") + prezzoN[i] = 99999 -#Genera la lista prezzoN[] che è un clone di "prezzo[] ma con i valori float anzichè string" -prezzoN = prezzo.copy() -for i in range(len(prezzo)): - if "€" in prezzo[i]: - prezzoN[i] = float(prezzo[i].replace("€", "").replace(",", ".").replace("da ", "")) - elif "Non" in prezzo[i]: - prezzoN[i] = 0 - prezzoN[i] = float(prezzoN[i]) + +def salvainrubrica(): + maxn=0 + def trova_nuovo_numero(): + global maxn + file_list = os.listdir('./DATI_RUBRICA') + number_list = [] + # Estare il numero + for file_name in file_list: + if file_name[0].isdigit(): + number_list.append(int(file_name.split('-')[0])) + # trova il massimo + if number_list: + max_number = max(number_list) + maxn=(max_number + 1) + + if os.path.exists('rubrica.txt'): + #SE LA RUBRICA ESISTE + with open('rubrica.txt', 'a+') as rubrica: + rubrica.seek(0) + data = rubrica.read() + if restaurant_url not in data: + #QUANDO IL RISTORATE NON E' PRESENTE IN RUBRICA + saveit = input('Vuoi salvare il ristorante in rubrica? [Y|N] ') + #PER DEBUG + #saveit = "y" + if saveit.upper() in ['YES', 'Y', 'SI', 'S']: + os.makedirs("DATI_RUBRICA", exist_ok=True) + trova_nuovo_numero() + nristmax=(str(maxn) + "-" + nrist + '.html') + shutil.move ('JEmenu.html', f'./DATI_RUBRICA/{nristmax}') + if data: + rubrica.write('\n') + rubrica.write(nristmax + '\n') + rubrica.write(nrist + '\n') + rubrica.write(restaurant_url + '\n') + else: + #QUANDO IL RISTORATE E' GIA' PRESENTE IN RUBRICA + os.remove("JEmenu.html") + else: + #QUANDO IL RISTORATE E' GIA' PRESENTE IN RUBRICA + os.remove("JEmenu.html") + else: - prezzoN[i] = 99999 + #SE LA RUBRICA NON ESISTE + with open('rubrica.txt', 'a+') as rubrica: + rubrica.seek(0) + data = rubrica.read() + #CHIEDE SE SI VUOLE SALVARE ALTRIMENTI PULISCE + saveit = input('Vuoi salvare il ristorante in rubrica? [Y|N] ') + #PER DEBUG + #saveit = "y" + if saveit.upper() in ['YES', 'Y', 'SI', 'S']: + os.makedirs("DATI_RUBRICA", exist_ok=True) + nristmax=("1" + "-" + nrist + '.html') + shutil.move ('JEmenu.html', f'./DATI_RUBRICA/{nristmax}') + if data: + rubrica.write('\n') + rubrica.write(nristmax + '\n') + rubrica.write(nrist + '\n') + rubrica.write(restaurant_url + '\n') + else: + #PULISCE + os.remove("JEmenu.html") + os.remove("rubrica.txt") + +############################################################################################# +############################################################################################# +############################################################################################# + + +if os.path.exists('rubrica.txt') and os.path.exists('./DATI_RUBRICA'): + + wr = input('\nVuoi vedere la rubrica? [Y|N] ') + + if wr.upper() in ['YES', 'Y', 'SI', 'S']: + print( '\n', os.listdir('./DATI_RUBRICA'), '\n') + + scelta = input("Scegli un numero esistente o premi Enter per metterre un link: ") + + parserdarubrica() + + else: + inputurl() + scraper() + parser() + stora_tutto() + driver.quit() +else: + inputurl() + scraper() + parser() + stora_tutto() + driver.quit() + +stampa_liste() +stampa_info() +genera_prezzoN() +salvainrubrica() + # #PER DEBUG @@ -223,68 +336,4 @@ for i in range(len(prezzo)): -#PROPORRE DI SALVARE IN RUBRICA - -maxn=0 -def trova_nuovo_numero(): - global maxn - file_list = os.listdir('./DATI_RUBRICA') - number_list = [] - # Estare il numero - for file_name in file_list: - if file_name[0].isdigit(): - number_list.append(int(file_name.split('-')[0])) - # trova il massimo - if number_list: - max_number = max(number_list) - maxn=(max_number + 1) - -if os.path.exists('rubrica.txt'): - #SE LA RUBRICA ESISTE - with open('rubrica.txt', 'a+') as rubrica: - rubrica.seek(0) - data = rubrica.read() - if restaurant_url not in data: - #QUANDO IL RISTORATE NON E' PRESENTE IN RUBRICA - saveit = input('Vuoi salvare il ristorante in rubrica? [Y|N] ') - #PER DEBUG - #saveit = "y" - if saveit.upper() in ['YES', 'Y', 'SI', 'S']: - os.makedirs("DATI_RUBRICA", exist_ok=True) - trova_nuovo_numero() - nristmax=(str(maxn) + "-" + nrist + '.html') - shutil.move ('JEmenu.html', f'./DATI_RUBRICA/{nristmax}') - if data: - rubrica.write('\n') - rubrica.write(nristmax + '\n') - rubrica.write(nrist + '\n') - rubrica.write(restaurant_url + '\n') - else: - #QUANDO IL RISTORATE E' GIA' PRESENTE IN RUBRICA - os.remove("JEmenu.html") - else: - #QUANDO IL RISTORATE E' GIA' PRESENTE IN RUBRICA - os.remove("JEmenu.html") - -else: - #SE LA RUBRICA NON ESISTE - with open('rubrica.txt', 'a+') as rubrica: - rubrica.seek(0) - data = rubrica.read() - #CHIEDE SE SI VUOLE SALVARE ALTRIMENTI PULISCE - saveit = input('Vuoi salvare il ristorante in rubrica? [Y|N] ') - #PER DEBUG - #saveit = "y" - if saveit.upper() in ['YES', 'Y', 'SI', 'S']: - os.makedirs("DATI_RUBRICA", exist_ok=True) - nristmax=("1" + "-" + nrist + '.html') - shutil.move ('JEmenu.html', f'./DATI_RUBRICA/{nristmax}') - if data: - rubrica.write('\n') - rubrica.write(nristmax + '\n') - rubrica.write(nrist + '\n') - rubrica.write(restaurant_url + '\n') - else: - #PULISCE - os.remove("JEmenu.html") - os.remove("rubrica.txt") +