姬長信(Redy)

python – 解析向下滚动的整个网页的html代…


from bs4 import BeautifulSoup
import urllib,sys
reload(sys)
sys.setdefaultencoding("utf-8")
r = urllib.urlopen('https://twitter.com/ndtv').read()
soup = BeautifulSoup(r)

这将使我不是整个网页向下滚动我想要的只是其中的一部分.

编辑:

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib,sys,requests
reload(sys)
sys.setdefaultencoding("utf-8")

class wait_for_more_than_n_elements_to_be_present(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False

def return_html_code(url):
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.get(url)
    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 10)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
    # scroll down to the last tweet until there is no more tweets loaded
    while True:
        tweets = driver.find_elements_by_css_selector("li[data-item-id]")
        number_of_tweets = len(tweets)
        print number_of_tweets
        driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
        try:
            wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
        except TimeoutException:
            break
    html_full_source=driver.page_source
    driver.close()
    return html_full_source


url='https://twitter.com/thecoolstacks'
#using selenium browser
html_source=return_html_code(url)
soup_selenium = BeautifulSoup(html_source)
print soup_selenium
text_tweet=[]
alltweets_selenium = soup_selenium.find_all(attrs={'data-item-type' : 'tweet'})
for tweet in alltweets_selenium:
    #Text of tweet
    html_tweet= tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text")
    text_tweet.append(''.join(html_tweet[0].findAll(text=True)))    
print text_tweet

预期产出:

import requests from bs4 import BeautifulSoup      url='https://twitter.com/thecoolstacks' 
req = requests.get(url) 
soup = BeautifulSoup(req.content) 
alltweets = soup.find_all(attrs={'data-item-type' : 'tweet'}) 
print alltweets[0]