# scrapeje.py

import json
import re
from html import escape as html_escape

#import requests
from bs4 import BeautifulSoup
#import cloudscraper
#import json
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


# Per-product result columns. The script appends to them while walking the
# menu cards and later reads them back by a shared index.
nome, desc, npezzi = [], [], []
prezzo, prezzof = [], []
# Not referenced anywhere else in the visible part of the script.
scripto = []


# Read the URL of the restaurant's JustEat page from user input (currently disabled)
#url = "https://www.justeat.it/restaurants-pizzeria-girasole-bologna/menu" 
#url = input('link della pagina justeat del ristorante: ')
#input ('inserisci il link della pagina justeat del ristorante: ')

# print("\n")
# url = print(Fore.WHITE + Style.DIM + "es https://www.justeat.it/NOME_RISTORANTE/menu" + Style.RESET_ALL)
# url = input("inserisci il link della pagina justeat del ristorante: ")


# Scrape the HTML while bypassing Cloudflare (cloudscraper variant, currently disabled)
# scraper = cloudscraper.create_scraper(browser={'browser': 'firefox','platform': 'windows','mobile': False})
# page = scraper.get(url).content #usa input manuale
# page = scraper.get("https://www.justeat.it/restaurants-saporedialeppo/menu").content #usa input automatico

# Create the HTML file (plain `requests` variant, currently disabled)
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# page = requests.get(url, headers=headers).content


# Restaurant pages used while developing; exactly one assignment is active.
#restaurant_url = "https://www.justeat.it/restaurants-saporedialeppo/menu"
#restaurant_url = 'https://www.justeat.it/restaurants-gelateria-ice-cream-casalecchio-di-reno-40033/menu'
restaurant_url = 'https://www.justeat.it/restaurants-pizzeria-la-garganica-bologna/menu'
#restaurant_url = 'https://www.justeat.it/restaurants-viavaipizzaekebab/menu'

# Render the page in a real Chrome instance and grab the resulting DOM.
driver = webdriver.Chrome()
try:
    driver.get(restaurant_url)

    # Wait up to 10 seconds for an element carrying this class to be present
    # before reading the page source.
    # NOTE(review): presumably this class marks the menu prices — confirm it
    # is also present when the restaurant is accepting orders.
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "c-menuItems-price--offline")))

    page = driver.page_source
finally:
    # Close the browser even when the wait above times out, so no Chrome
    # process is left running.
    driver.quit()

# Save the rendered page to disk. The encoding is explicit so the write does
# not depend on the platform's default code page.
with open('JEmenu.html', 'w', encoding='utf-8') as f:
    f.write(page)

# Read the saved file back; the parser below works on this copy.
with open('JEmenu.html', 'r', encoding='utf-8') as f:
    page = f.read()

# Parse the saved page.
soup = BeautifulSoup(page, "html.parser")

# Restaurant name: the <title> text with a fixed-length prefix and suffix cut
# off. Falls back to an empty string when the page has no <title>.
# NOTE(review): the 8 / -32 offsets presumably match JustEat's title
# template — confirm against a live page.
nrist = soup.title.text[8:-32] if soup.title else ""

# Restaurant phone number: it is looked up inside the <script> tags whose
# text mentions "allergenPhoneNumber".
script_tags = soup.find_all('script', string=re.compile(r'allergenPhoneNumber'))

# Captures the digits of the "allergenPhoneNumber" value.
pattern = re.compile(r'"allergenPhoneNumber":"(\d+)"')

# `tel` ends up as the digit string, or None when no script tag or no number
# is found (instead of failing with IndexError on an empty result list).
tel = None
if script_tags:
    tel = pattern.search(script_tags[0].string or "")
    if tel:
        tel = tel.group(1)


# Restaurant status: text of the order-status wrapper, flattened to one line.
status_tag = soup.find(attrs={"data-js-test": "order-status-wrapper"})
restaurant_is_open = status_tag.text.replace('\n', ' ') if status_tag else ""

# Split the flattened text on runs of four or more spaces; `result` is the
# list of chunks printed in the summary further down.
regex = r" {4,}"
result = re.split(regex, restaurant_is_open)

# Restaurant address (empty string when the header element is missing).
address_tag = soup.find(attrs={"data-js-test": "header-restaurantAddress"})
restaurant_address = address_tag.text if address_tag else ""

def _first_text_line(tag):
    """Return the first line of *tag*'s left-stripped text.

    Returns None when *tag* is None or contains only whitespace.
    """
    if tag is None:
        return None
    lines = tag.get_text().lstrip().splitlines()
    return lines[0] if lines else None


# Walk the product cards. Every card contributes exactly one entry to each of
# nome / desc / prezzo / npezzi, so the lists stay index-aligned.
for menu in soup.find_all(attrs={"data-test-id": "menu-item"}):
    # Fill the "nome" list.
    item_name = _first_text_line(menu.find(attrs={"data-test-id": "menu-item-name"}))
    if item_name is None:
        # A card without a readable name is skipped as a whole, so it cannot
        # shift the other columns.
        continue
    nome.append(item_name)

    # Fill the "desc" list (None when the card has no description).
    desc.append(_first_text_line(menu.find("p", class_="c-menuItems-description")))

    # Fill the "prezzo" list (empty string when the card has no price text,
    # so later string handling still works).
    prezzo.append(_first_text_line(menu.find(attrs={"data-js-test": "menu-item-price"})) or "")

    # Fill the "npezzi" list: when the card mentions "pezzo"/"pezzi", take the
    # 8th line of the card text.
    # NOTE(review): index 7 presumably matches the current card layout —
    # confirm; cards with fewer lines now yield None instead of IndexError.
    card_text = menu.text
    card_lines = card_text.splitlines()
    if ("pezzo" in card_text or "pezzi" in card_text) and len(card_lines) > 7:
        npezzi.append(card_lines[7].lstrip())
    else:
        npezzi.append(None)

def _parse_price(text):
    """Extract the first number in *text* as a float, or None if there is none.

    Accepts an optional leading "da " and both decimal conventions:
    "da 1,00 €" -> 1.0, "1.234,50 €" -> 1234.5, "7.50" -> 7.5.
    A value with dots but no comma is read with the dot as decimal point.
    """
    match = re.search(r"(?:da\s+)?(\d[\d.,]*)", text)
    if match is None:
        return None
    digits = match.group(1)
    if "," in digits:
        # Italian format: dots group thousands, the comma is the decimal mark.
        digits = digits.replace(".", "").replace(",", ".")
    try:
        return float(digits)
    except ValueError:
        # e.g. "1.2.3": not a usable number.
        return None


# Print every product and build "prezzof", the numeric twin of "prezzo"
# (one entry per product; 0.0 when no number can be read from the price text).
for x, product_name in enumerate(nome):
    print("\n")
    print(product_name)
    print(desc[x])
    print(npezzi[x])
    print(prezzo[x])

    amount = _parse_price(prezzo[x])
    if amount is None:
        amount = 0.0
    prezzof.append(amount)
    print(amount)

# Print the restaurant header, the opening-status chunks and the list sizes.
print("\n")
print(nrist)
print(restaurant_address.strip())
print("\n")

# "doppione" remembers the chunk that was already printed next to a numeric
# chunk; when that same chunk comes up again on its own, a blank line is
# printed instead of repeating it.
doppione = ""
for i, chunk in enumerate(result):
    if re.search(r"[a-zA-Z]", chunk):
        if chunk == doppione:
            print("\n")
        else:
            print(chunk)
    elif re.search(r"\d", chunk):
        if i + 1 < len(result):
            # Numeric chunk: print it together with the chunk that follows.
            doppione = result[i + 1]
            print(chunk, doppione)
        else:
            # Numeric chunk at the very end: nothing follows, print it alone
            # rather than indexing past the list.
            print(chunk)
print("telefono:",tel)
print("lista nome:",len(nome))
print("lista desc:",len(desc))
print("lista npezzi:",len(npezzi))
print("lista prezzi:",len(prezzo)) # entries are raw strings such as "da 1,00 €"; they must be converted to numbers before doing arithmetic


# Generate the HTML page: one heading, price and "+1" button per product,
# followed by a box that collects the products the user clicked.
html_parts = ['<html><head><meta charset="utf-8"></head><body>']
for product_name, pieces, price_text, price_value in zip(nome, npezzi, prezzo, prezzof):
    # Scraped text is escaped before it is placed into markup.
    html_parts.append("<h2>" + html_escape(product_name) + "</h2>")
    # The description is intentionally not rendered (it may be None).
    if pieces is not None:
        html_parts.append("<p>N. pezzi disponibili: " + html_escape(str(pieces)) + "</p>")
    # NOTE(review): the scraped price text may already contain a "€" sign,
    # which would then appear twice — confirm and drop one if so.
    html_parts.append("<p>Prezzo: €" + html_escape(str(price_text)) + "</p>")
    # json.dumps produces valid JavaScript literals (names containing
    # apostrophes or quotes included); html_escape then makes the whole call
    # safe inside the double-quoted onclick attribute.
    on_click = "aggiungiProdotto(" + json.dumps(product_name) + ", " + json.dumps(price_value) + ")"
    html_parts.append('<button onclick="' + html_escape(on_click, quote=True) + '">+1</button>')
    html_parts.append("<br><br>")

html_parts.append("<br><hr><h2>Prodotti aggiunti</h2>")
html_parts.append("<div id=\"prodottiAggiunti\"></div>")
html_parts.append("<script>")
html_parts.append("function aggiungiProdotto(nome, prezzo) {")
html_parts.append("   var prodotto = nome + ' (€' + Number(prezzo).toFixed(2) + ')';")
html_parts.append("   var box = document.getElementById('prodottiAggiunti');")
# textContent keeps the product name as plain text instead of re-parsing it
# as markup the way innerHTML would.
html_parts.append("   var riga = document.createElement('p');")
html_parts.append("   riga.textContent = prodotto;")
html_parts.append("   box.appendChild(riga);")
html_parts.append("}")
html_parts.append("</script>")
html_parts.append("</body></html>")
html = "".join(html_parts)

# Save to file. UTF-8 matches the <meta charset> declared above, so "€" and
# accented letters are written and displayed correctly on every platform.
with open("pagina.html", "w", encoding="utf-8") as file:
    file.write(html)
0..0.1 2023-07-31 00:37:12 +02:00			`import re`
			`#import requests`
Bozza scrape JustEat menu Scrape nomeProdotto descrizione descrizione2 (es 1 pezzo, 2 pezzi, 3 pezzi, tipo di falafel) prezzo 2022-09-27 18:47:25 +02:00			`from bs4 import BeautifulSoup`
Update 'ScrapeJE.py' 2022-12-01 15:19:56 +01:00			`#import cloudscraper`
Update 'ScrapeJE.py' 2022-12-01 15:30:55 +01:00			`#import json`
0..0.1 2023-07-31 00:37:12 +02:00			`from selenium import webdriver`
			`from selenium.webdriver.support.ui import WebDriverWait`
			`from selenium.webdriver.support import expected_conditions as EC`
			`from selenium.webdriver.common.by import By`

Bozza scrape JustEat menu Scrape nomeProdotto descrizione descrizione2 (es 1 pezzo, 2 pezzi, 3 pezzi, tipo di falafel) prezzo 2022-09-27 18:47:25 +02:00
			`nome=[]`
			`desc=[]`
Update 'BozzaScrapeJE.py' 2022-09-29 19:07:26 +02:00			`npezzi=[]`
Bozza scrape JustEat menu Scrape nomeProdotto descrizione descrizione2 (es 1 pezzo, 2 pezzi, 3 pezzi, tipo di falafel) prezzo 2022-09-27 18:47:25 +02:00			`prezzo=[]`
0..0.1 2023-07-31 00:37:12 +02:00			`prezzof=[]`
Update 'ScrapeJE.py' 2022-12-01 15:19:56 +01:00			`scripto=[]`

Bozza scrape JustEat menu Scrape nomeProdotto descrizione descrizione2 (es 1 pezzo, 2 pezzi, 3 pezzi, tipo di falafel) prezzo 2022-09-27 18:47:25 +02:00
Update 'ScrapeJE.py' 2022-10-01 03:28:54 +02:00			`#prende l url della pagina justeat del ristorante in input`
Update 'ScrapeJE.py' 2022-12-01 23:26:10 +01:00			`#url = "https://www.justeat.it/restaurants-pizzeria-girasole-bologna/menu"`
Update 'ScrapeJE.py' 2022-12-01 15:19:56 +01:00			`#url = input('link della pagina justeat del ristorante: ')`
Update 'ScrapeJE.py' 2022-12-01 23:26:10 +01:00			`#input ('inserisci il link della pagina justeat del ristorante: ')`

0..0.1 2023-07-31 00:37:12 +02:00			`# print("\n")`
			`# url = print(Fore.WHITE + Style.DIM + "es https://www.justeat.it/NOME_RISTORANTE/menu" + Style.RESET_ALL)`
			`# url = input("inserisci il link della pagina justeat del ristorante: ")`
Update 'ScrapeJE.py' 2022-12-01 23:26:10 +01:00
Update 'ScrapeJE.py' 2022-12-01 15:19:56 +01:00
			`#scrape html scavalcando cloudflare`
			`# scraper = cloudscraper.create_scraper(browser={'browser': 'firefox','platform': 'windows','mobile': False})`
			`# page = scraper.get(url).content #usa input manuale`
			`# page = scraper.get("https://www.justeat.it/restaurants-saporedialeppo/menu").content #usa input automatico`
Update 'ScrapeJE.py' 2022-10-01 03:28:54 +02:00
			`#crea il file html`
0..0.1 2023-07-31 00:37:12 +02:00			`# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}`
			`# page = requests.get(url, headers=headers).content`


			`restaurant_url = "https://www.justeat.it/restaurants-saporedialeppo/menu"`
			`restaurant_url = 'https://www.justeat.it/restaurants-gelateria-ice-cream-casalecchio-di-reno-40033/menu'`
			`restaurant_url = 'https://www.justeat.it/restaurants-pizzeria-la-garganica-bologna/menu'`
			`#restaurant_url = 'https://www.justeat.it/restaurants-viavaipizzaekebab/menu'`
			`driver = webdriver.Chrome()`
			`driver.get(restaurant_url)`

			`wait = WebDriverWait(driver, 10)`
			`wait.until(EC.presence_of_element_located((By.CLASS_NAME, "c-menuItems-price--offline")))`


Update 'ScrapeJE.py' 2022-12-01 15:19:56 +01:00
0..0.1 2023-07-31 00:37:12 +02:00			`page = driver.page_source`




			`with open('JEmenu.html', 'w') as f:`
Update 'ScrapeJE.py' 2022-10-01 03:09:17 +02:00			`f.write(page)`
Bozza scrape JustEat menu Scrape nomeProdotto descrizione descrizione2 (es 1 pezzo, 2 pezzi, 3 pezzi, tipo di falafel) prezzo 2022-09-27 18:47:25 +02:00
Update 'ScrapeJE.py' 2022-10-01 03:28:54 +02:00			`#apre e legge il file`
0..0.1 2023-07-31 00:37:12 +02:00			`with open('JEmenu.html', 'r') as f:`
Bozza scrape JustEat menu Scrape nomeProdotto descrizione descrizione2 (es 1 pezzo, 2 pezzi, 3 pezzi, tipo di falafel) prezzo 2022-09-27 18:47:25 +02:00			`page = f.read()`

Update 'ScrapeJE.py' 2022-10-01 03:28:54 +02:00			`#parser`
Bozza scrape JustEat menu Scrape nomeProdotto descrizione descrizione2 (es 1 pezzo, 2 pezzi, 3 pezzi, tipo di falafel) prezzo 2022-09-27 18:47:25 +02:00			`soup = BeautifulSoup(page, "html.parser")`
Update 'ScrapeJE.py' 2022-10-01 00:22:45 +02:00
			`#Stora nome ristorante`
Update 'ScrapeJE.py' 2022-12-01 15:19:56 +01:00			`nrist = soup.title.text[8:-32]`
Update 'BozzaScrapeJE.py' 2022-09-30 23:27:10 +02:00
Bozza scrape JustEat menu Scrape nomeProdotto descrizione descrizione2 (es 1 pezzo, 2 pezzi, 3 pezzi, tipo di falafel) prezzo 2022-09-27 18:47:25 +02:00
Update 'ScrapeJE.py' 2022-10-01 03:28:54 +02:00
Update 'ScrapeJE.py' 2022-12-01 15:19:56 +01:00
0..0.1 2023-07-31 00:37:12 +02:00
			`# #Stora telefono del ristorante`

			`# Define the regex pattern`
			`pattern = re.compile(r'allergenPhoneNumber')`

			`# Find all script tags that match the regex pattern`
			`script_tags = soup.find_all('script', text=pattern)`

			`# Define the regex pattern`
			`pattern = re.compile(r'"allergenPhoneNumber":"(\d+)"')`

			`# Search for the phone number in the given string`
			`tel = re.search(pattern, script_tags[0].next)`
			`if tel:`
			`tel = tel.group(1)`


			`#Stato ristorante`
			`restaurant_is_open = menu = soup.find(attrs={"data-js-test":"order-status-wrapper"}).text`
			`restaurant_is_open = restaurant_is_open.replace('\n', ' ')`

			`regex = r" {4,}"`
			`result = re.split(regex, restaurant_is_open)`

			`restaurant_address = soup.find(attrs={"data-js-test":"header-restaurantAddress"}).text`
Update 'ScrapeJE.py' 2022-10-01 03:28:54 +02:00
			`#cicla le schede prodotto`
Update 'ScrapeJE.py' 2022-12-01 15:19:56 +01:00			`menu = soup.find(attrs={"data-test-id": "menu-item"})`
Update 'BozzaScrapeJE.py' 2022-09-29 19:07:26 +02:00			`for menu in soup.find_all(attrs={"data-test-id": "menu-item"}):`
			`att=menu`
			`#riempie la lista "nome"`
			`for att in menu.find(attrs={"data-test-id": "menu-item-name"}):`
Update 'ScrapeJE.py' 2022-12-01 23:26:10 +01:00			`if att != type(None):`
			`if att != " ":`
			`nome.append(att.lstrip().splitlines()[0])`
			`break`
			`else:`
			`continue`

Update 'BozzaScrapeJE.py' 2022-09-29 19:07:26 +02:00
			`#riempie la lista "desc"`
			`att=menu.find("p", class_="c-menuItems-description")`
			`if att != None:`
			`for att in menu.find("p", class_="c-menuItems-description"):`
			`desc.append(att.lstrip().splitlines()[0])`
Bozza scrape JustEat menu Scrape nomeProdotto descrizione descrizione2 (es 1 pezzo, 2 pezzi, 3 pezzi, tipo di falafel) prezzo 2022-09-27 18:47:25 +02:00			`else:`
Update 'BozzaScrapeJE.py' 2022-09-29 19:07:26 +02:00			`desc.append(None)`
Bozza scrape JustEat menu Scrape nomeProdotto descrizione descrizione2 (es 1 pezzo, 2 pezzi, 3 pezzi, tipo di falafel) prezzo 2022-09-27 18:47:25 +02:00
Update 'BozzaScrapeJE.py' 2022-09-29 19:07:26 +02:00			`#riempie la lista "prezzo"`
0..0.1 2023-07-31 00:37:12 +02:00			`for att in menu.find(attrs={"data-js-test": "menu-item-price"}):`
Update 'BozzaScrapeJE.py' 2022-09-29 19:07:26 +02:00			`prezzo.append(att.lstrip().splitlines()[0])`
Bozza scrape JustEat menu Scrape nomeProdotto descrizione descrizione2 (es 1 pezzo, 2 pezzi, 3 pezzi, tipo di falafel) prezzo 2022-09-27 18:47:25 +02:00
0..0.1 2023-07-31 00:37:12 +02:00

Update 'ScrapeJE.py' 2022-10-01 03:28:54 +02:00			`#riempie la lista "npezzi"`
Update 'BozzaScrapeJE.py' 2022-09-30 23:27:10 +02:00			`att=menu.find_all(attrs={"data-test-id": "menu-item-description"})`
			`if att != None:`
			`if menu.text.find("pezzo") > 0 or menu.text.find("pezzi") > 0:`
			`npezzi.append(menu.text.splitlines()[7].lstrip())`
			`else:`
			`npezzi.append(None)`
			`continue`

0..0.1 2023-07-31 00:37:12 +02:00			`# #stampa liste`
Bozza scrape JustEat menu Scrape nomeProdotto descrizione descrizione2 (es 1 pezzo, 2 pezzi, 3 pezzi, tipo di falafel) prezzo 2022-09-27 18:47:25 +02:00			`for x in range(len(nome)):`
Update 'BozzaScrapeJE.py' 2022-09-30 23:27:10 +02:00			`print("\n")`
			`print(nome[x])`
			`print(desc[x])`
			`print(npezzi[x])`
			`print(prezzo[x])`
0..0.1 2023-07-31 00:37:12 +02:00			`regex = r"(?:da\s+)?([\d.]+)"`

			`match = re.search(regex, prezzo[x])`
			`if match:`
			`numero = match.group(1)`
			`print(numero)`
			`prezzof.append(int(numero))`
			`else:`
			`prezzof.append(0)`
			`print(prezzof[x])`
Update 'BozzaScrapeJE.py' 2022-09-29 19:07:26 +02:00
Update 'ScrapeJE.py' 2022-12-01 15:19:56 +01:00			`#stampa lunghezza liste`
Update 'ScrapeJE.py' 2022-10-01 00:22:45 +02:00			`print("\n")`
0..0.1 2023-07-31 00:37:12 +02:00			`print(nrist)`
			`print(restaurant_address.strip())`
			`print("\n")`
			`doppione = ""`
			`for i in range(len(result)):`
			`if re.search(r"[a-zA-Z]", result[i]):`
			`if (result[i]) == doppione:`
			`print("\n")`
			`else:`
			`print(result[i])`
			`elif re.search(r"\d", result[i]):`
			`print(result[i], result[i+1])`
			`doppione = (result[i+1])`
Update 'ScrapeJE.py' 2022-12-01 15:30:55 +01:00			`print("telefono:",tel)`
			`print("lista nome:",len(nome))`
Update 'BozzaScrapeJE.py' 2022-09-29 19:07:26 +02:00			`print("lista desc:",len(desc))`
			`print("lista npezzi:",len(npezzi))`
Update 'ScrapeJE.py' 2022-12-01 15:19:56 +01:00			`print("lista prezzi:",len(prezzo)) #sono stringhe ovvero ci sono anche prezzi come "da 1,00 €" (servirà formattarla in double per poter fare i conti)`


0..0.1 2023-07-31 00:37:12 +02:00

			`# Generazione del codice HTML`
			`html = "<html><body>"`
			`for x in range(len(nome)):`
			`html += "<h2>" + nome[x] + "</h2>"`
			`# html += "<p>" + desc[x] + "</p>"`
			`html += "<p>N. pezzi disponibili: " + str(npezzi[x]) + "</p>"`
			`html += "<p>Prezzo: €" + str(prezzo[x]) + "</p>"`
			`html += "<button onclick=\"aggiungiProdotto('" + nome[x] + "', '" + str(prezzof[x]) + "')\">+1</button>"`
			`html += "<br><br>"`

			`html += "<br><hr><h2>Prodotti aggiunti</h2>"`
			`html += "<div id=\"prodottiAggiunti\"></div>"`
			`html += "<script>"`
			`html += "function aggiungiProdotto(nome, prezzo) {"`
			`html += " var prodotto = nome + ' (€' + Number(prezzo).toFixed(2) + ')';"`
			`html += " var box = document.getElementById('prodottiAggiunti');"`
			`html += " box.innerHTML += '<p>' + prodotto + '</p>';"`
			`html += "}"`
			`html += "</script>"`
			`html += "</body></html>"`

			`# Salvataggio su file`
			`with open("pagina.html", "w") as file:`
			`file.write(html)`