1717
1818import re
1919import time
20- import urllib
21- import urllib .request
2220
21+ import httpx
2322from loguru import logger
2423
2524__name__ = 'yandexenum'
26- url_opener = urllib .request .FancyURLopener
27-
28-
29- class AppURLopener (url_opener ):
30- version = """Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
31- (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"""
3225
3326
def scrape_yandex(dom):
    """Collect hostnames ending in *dom* from Yandex search result pages.

    Sends a ``site:<dom>`` query to yandex.com once per entry in
    ``searches`` and pulls every hostname-like token that ends with ``dom``
    out of the returned page text.

    Args:
        dom: Domain to look for, e.g. ``'example.com'``.

    Returns:
        Whatever ``unique(results)`` yields for the collected matches
        (``unique`` is defined elsewhere in this module; presumably it
        de-duplicates). If Yandex serves a captcha page, the matches
        gathered so far are returned. If a request raises, an empty list
        is returned and earlier matches are discarded.
    """
    results = []
    searches = ['1', '2', '3', '4', '5', '10', '20', '30']

    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'
        )
    }

    # Loop-invariant pieces are built once. `dom` is escaped so that the
    # dots in it match literally instead of acting as regex wildcards.
    url = 'https://yandex.com/search/?text=site%3A' + dom
    host_pattern = re.compile(r'([a-zA-Z0-9\-\.]+' + re.escape(dom) + ')/?')

    with httpx.Client(headers=headers) as client:
        # NOTE(review): the values in `searches` are never put into the URL,
        # so every iteration requests the same page. They look like page
        # numbers -- confirm the intended pagination parameter before
        # changing the request.
        for _ in searches:
            try:
                response = client.get(url, timeout=10.0)
                data = response.text
            except Exception as e:
                # Best-effort scraper: log the failure and give up.
                logger.error(e)
                return []

            # Yandex answered with a captcha challenge; further requests
            # would be blocked too, so return what has been collected.
            if 'enter_captcha_value' in data:
                logger.error("Yandex has detected the search as 'bot activity, stopping search...")
                return unique(results)

            results.extend(host_pattern.findall(data))

            # Throttle between requests to reduce the chance of a captcha.
            time.sleep(10)

    return unique(results)
6162
0 commit comments